diff --git "a/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.9.log" "b/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.9.log" new file mode 100644--- /dev/null +++ "b/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.9.log" @@ -0,0 +1,4247 @@ +# Running on gpub001.delta.ncsa.illinois.edu +# Started at Mon Jul 3 22:24:10 CDT 2023 +# SLURMD_NODENAME=gpub001 +# SLURM_CLUSTER_NAME=delta +# SLURM_CONF=/var/spool/slurmd/conf-cache/slurm.conf +# SLURM_CPUS_ON_NODE=64 +# SLURM_CPUS_PER_TASK=64 +# SLURM_EXPORT_ENV=PATH +# SLURM_GET_USER_ENV=1 +# SLURM_GPUS_ON_NODE=4 +# SLURM_GTIDS=0 +# SLURM_JOBID=2121665 +# SLURM_JOB_ACCOUNT=bbjs-delta-gpu +# SLURM_JOB_CPUS_PER_NODE='64(x16)' +# SLURM_JOB_GID=202 +# SLURM_JOB_GPUS=0,1,2,3 +# SLURM_JOB_ID=2121665 +# SLURM_JOB_NAME=exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.log +# SLURM_JOB_NODELIST='gpub[001-002,015-016,022,030-032,059-060,066-067,076-077,079,096]' +# SLURM_JOB_NUM_NODES=16 +# SLURM_JOB_PARTITION=gpuA40x4 +# SLURM_JOB_QOS=bbjs-delta-gpu +# SLURM_JOB_UID=68077 +# SLURM_JOB_USER=peng6 +# SLURM_LOCALID=0 +# SLURM_MEM_PER_NODE=240000 +# SLURM_NNODES=16 +# SLURM_NODEID=0 +# SLURM_NODELIST='gpub[001-002,015-016,022,030-032,059-060,066-067,076-077,079,096]' +# SLURM_NODE_ALIASES='(null)' +# SLURM_OPEN_MODE=a +# SLURM_PRIO_PROCESS=0 +# SLURM_PROCID=0 +# SLURM_SUBMIT_DIR=/scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1 +# SLURM_SUBMIT_HOST=dt-login02.delta.internal.ncsa.edu +# SLURM_TASKS_PER_NODE='1(x16)' +# SLURM_TASK_PID=383686 +# SLURM_TOPOLOGY_ADDR=ss00.ss09.gpub001 +# SLURM_TOPOLOGY_ADDR_PATTERN=switch.switch.node +# SLURM_WORKING_CLUSTER=delta:dt-sched:6817:9728:109 +# srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type 
dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_cc45b274-aa68-4d2c-943c-66da258b53f0 +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file 
+[gpub001:0/64] 2023-07-03 22:27:37,296 (distributed_c10d:319) INFO: Added key: store_based_barrier_key:1 to store for rank: 0
+[gpub001:0/64] 2023-07-03 22:27:37,982 (distributed_c10d:353) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 64 nodes.
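A note on the launch configuration: --dist_launcher slurm with --ngpu 4 and --multiprocessing_distributed true spawns four GPU workers per SLURM task, so the 16 nodes form a 64-rank process group, matching the "with 64 nodes" in the barrier message above. Below is a minimal sketch of the equivalent rendezvous in plain PyTorch; the NCCL backend and the rank arithmetic are assumptions for illustration, not ESPnet's actual launcher code.

import os
import torch.distributed as dist

def init_distributed(local_rank: int) -> None:
    # Hypothetical helper: one SLURM task per node (SLURM_PROCID in 0..15),
    # four GPU workers per task (local_rank in 0..3) -> global rank 0..63.
    rank = int(os.environ["SLURM_PROCID"]) * 4 + local_rank
    dist.init_process_group(
        backend="nccl",  # assumed; the usual choice for multi-GPU training
        # Shared-filesystem store, copied from --dist_init_method in the log:
        init_method="file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_cc45b274-aa68-4d2c-943c-66da258b53f0",
        world_size=64,   # 16 nodes x 4 GPUs
        rank=rank,
    )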
+[gpub001:0/64] 2023-07-03 22:27:38,012 (s2t:483) INFO: Vocabulary size: 50002
+[gpub001:0/64] 2023-07-03 22:27:54,659 (abs_task:1201) INFO: pytorch.version=1.13.1, cuda.available=True, cudnn.version=8500, cudnn.benchmark=False, cudnn.deterministic=True
+[gpub001:0/64] 2023-07-03 22:27:54,668 (abs_task:1202) INFO: Model structure:
+ESPnetS2TModel(
+  (frontend): DefaultFrontend(
+    (stft): Stft(n_fft=512, win_length=400, hop_length=160, center=True, normalized=False, onesided=True)
+    (frontend): Frontend()
+    (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False)
+  )
+  (specaug): SpecAug(
+    (freq_mask): MaskAlongAxis(mask_width_range=[0, 27], num_mask=2, axis=freq)
+    (time_mask): MaskAlongAxisVariableMaxWidth(mask_width_ratio_range=[0.0, 0.05], num_mask=10, axis=time)
+  )
+  (normalize): GlobalMVN(stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz, norm_means=True, norm_vars=True)
+  (encoder): TransformerEncoder(
+    (embed): Conv2dSubsampling(
+      (conv): Sequential(
+        (0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2))
+        (1): ReLU()
+        (2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2))
+        (3): ReLU()
+      )
+      (out): Sequential(
+        (0): Linear(in_features=19456, out_features=1024, bias=True)
+        (1): PositionalEncoding(
+          (dropout): Dropout(p=0.1, inplace=False)
+        )
+      )
+    )
+    (encoders): MultiSequential(
+      (0-23): 24 x EncoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+    )
+    (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+  )
+  (decoder): TransformerDecoder(
+    (embed): Sequential(
+      (0): Embedding(50002, 1024)
+      (1): PositionalEncoding(
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+    )
+    (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+    (output_layer): Linear(in_features=1024, out_features=50002, bias=True)
+    (decoders): MultiSequential(
+      (0-2): 3 x DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (3): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), 
eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): 
LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + 
(w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (12): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (13): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (14): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): 
Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (15): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (16): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (17): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): 
Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (18): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (19): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (20): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): 
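For orientation, the repeated block above is a standard post-norm Transformer layer. A minimal PyTorch sketch with the same shapes; the head count is not printed in this excerpt, so 16 heads is an assumption, and torch's built-in `nn.MultiheadAttention` stands in for ESPnet's `MultiHeadedAttention` (same q/k/v/out projections, fused internally). A `DecoderLayer` additionally carries a `src_attn` cross-attention block and `norm3`.

```python
import torch
import torch.nn as nn

d_model, d_ff, p_drop = 1024, 4096, 0.1  # shapes taken from the dump above

class EncoderLayerSketch(nn.Module):
    """One post-norm encoder block matching the printed module shapes (sketch only)."""
    def __init__(self, nhead: int = 16):  # 16 heads is an assumption, not in the log
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead,
                                               dropout=p_drop, batch_first=True)
        self.feed_forward = nn.Sequential(      # w_1 -> ReLU -> dropout -> w_2
            nn.Linear(d_model, d_ff), nn.ReLU(),
            nn.Dropout(p_drop), nn.Linear(d_ff, d_model),
        )
        self.norm1 = nn.LayerNorm(d_model, eps=1e-12)
        self.norm2 = nn.LayerNorm(d_model, eps=1e-12)
        self.dropout = nn.Dropout(p_drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.norm1(x + self.dropout(self.self_attn(x, x, x, need_weights=False)[0]))
        return self.norm2(x + self.dropout(self.feed_forward(x)))

x = torch.randn(2, 50, d_model)       # (batch, frames, features)
print(EncoderLayerSketch()(x).shape)  # torch.Size([2, 50, 1024])
```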
+
+Model summary:
+    Class Name: ESPnetS2TModel
+    Total Number of model parameters: 888.51 M
+    Number of trainable parameters: 888.51 M (100.0%)
+    Size: 3.55 GB
+    Type: torch.float32
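The 888.51 M figure can be roughly cross-checked from the printed shapes alone. A back-of-the-envelope count over the modules visible in this excerpt lands at ~859 M; the remaining ~29 M presumably sits in the conv2d subsampling frontend, which is outside this excerpt, so treat the breakdown as an estimate:

```python
# Back-of-the-envelope check of "888.51 M", using only the shapes printed
# above (biases and LayerNorm weights included).
d, ff, vocab, n_enc, n_dec = 1024, 4096, 50002, 24, 24

attn = 4 * (d * d + d)                    # linear_q / linear_k / linear_v / linear_out
ffn = (d * ff + ff) + (ff * d + d)        # w_1 + w_2
enc_layer = attn + ffn + 2 * 2 * d        # + norm1, norm2
dec_layer = 2 * attn + ffn + 3 * 2 * d    # self_attn + src_attn, norm1-3

total = (n_enc * enc_layer + n_dec * dec_layer
         + vocab * d                      # decoder embedding
         + 2 * (d * vocab + vocab)        # output_layer and ctc_lo
         + 2 * 2 * d)                     # the two after_norm layers
print(f"{total / 1e6:.1f} M")             # 859.1 M; ~29 M gap = conv2d frontend (not shown)
```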
+[gpub001:0/64] 2023-07-03 22:27:54,668 (abs_task:1205) INFO: Optimizer:
+AdamW (
+Parameter Group 0
+    amsgrad: False
+    betas: [0.9, 0.98]
+    capturable: False
+    eps: 1e-06
+    foreach: None
+    initial_lr: 0.00025
+    lr: 2.5e-08
+    maximize: False
+    weight_decay: 0.0
+)
+[gpub001:0/64] 2023-07-03 22:27:54,669 (abs_task:1206) INFO: Scheduler: WarmupLR(warmup_steps=10000)
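The seemingly tiny `lr: 2.5e-08` next to `initial_lr: 0.00025` is just warmup at step 1. Assuming WarmupLR implements the usual Noam-style schedule, lr(step) = base_lr · warmup^0.5 · min(step^-0.5, step · warmup^-1.5), the printed value falls out exactly:

```python
# Sketch (assumption): ESPnet2's WarmupLR follows the Noam schedule.
base_lr, warmup = 2.5e-4, 10_000

def warmup_lr(step: int) -> float:
    return base_lr * warmup**0.5 * min(step**-0.5, step * warmup**-1.5)

print(warmup_lr(1))       # 2.5e-08 -- matches the "lr: 2.5e-08" printed above
print(warmup_lr(10_000))  # 2.5e-04 -- peaks at initial_lr when warmup ends
```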
+[gpub001:0/64] 2023-07-03 22:27:54,674 (abs_task:1215) INFO: Saving the configuration in exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/config.yaml
+[gpub001:0/64] 2023-07-03 22:27:55,384 (abs_task:1272) INFO: Loading pretrained params from /scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v2/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e18_d18_lr5e-4_warmup20k_raw_bpe50000/valid.acc.ave.pth
+[gpub001:0/64] 2023-07-03 22:28:02,397 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-03 22:28:02,589 (abs_task:1570) INFO: [valid] dataset:
+ESPnetDataset(
+  speech: {"path": "dump/raw/dev/wav.scp", "type": "kaldi_ark"}
+  text_prev: {"path": "dump/raw/dev/text.prev", "type": "text"}
+  text_ctc: {"path": "dump/raw/dev/text.ctc", "type": "text"}
+  text: {"path": "dump/raw/dev/text", "type": "text"}
+  preprocess: )
+[gpub001:0/64] 2023-07-03 22:28:02,589 (abs_task:1571) INFO: [valid] Batch sampler: UnsortedBatchSampler(N-batch=1012, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/valid/speech_shape,
+[gpub001:0/64] 2023-07-03 22:28:02,599 (abs_task:1572) INFO: [valid] mini-batch sizes summary: N-batch=1012, mean=128.1, min=128, max=129
+[gpub001:0/64] 2023-07-03 22:28:03,079 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-03 22:28:03,400 (abs_task:1570) INFO: [plot_att] dataset:
+ESPnetDataset(
+  speech: {"path": "dump/raw/dev/wav.scp", "type": "kaldi_ark"}
+  text_prev: {"path": "dump/raw/dev/text.prev", "type": "text"}
+  text_ctc: {"path": "dump/raw/dev/text.ctc", "type": "text"}
+  text: {"path": "dump/raw/dev/text", "type": "text"}
+  preprocess: )
+[gpub001:0/64] 2023-07-03 22:28:03,400 (abs_task:1571) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=129591, batch_size=1, key_file=exp/s2t_stats_raw_bpe50000/valid/speech_shape,
+[gpub001:0/64] 2023-07-03 22:28:03,400 (abs_task:1572) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1
+[gpub001:0/64] 2023-07-03 22:28:31,184 (trainer:159) INFO: The training was resumed using exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/checkpoint.pth
+gpub001:383774:383774 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.101<0>
+gpub001:383774:383774 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub001:383774:383774 [0] NCCL INFO cudaDriverVersion 12010
+NCCL version 2.14.3+cuda11.7
+[gpub001:0/64] 2023-07-03 22:28:36,765 (trainer:284) INFO: 9/100epoch started
+[gpub001:0/64] 2023-07-03 22:28:36,808 (multiple_iter_factory:32) INFO: Building 0th iter-factory...
+[gpub001:0/64] 2023-07-03 22:28:58,125 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-03 22:29:02,284 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.3", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.3", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.3", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.3", "type": "text"}
+  preprocess: )
+[gpub001:0/64] 2023-07-03 22:29:02,284 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.3,
+[gpub001:0/64] 2023-07-03 22:29:02,292 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
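The [valid] and [plot_att] samplers run over the same dev key_file, so the batch statistics above can be cross-checked; the plot_att factory apparently iterates only a few utterances for attention plotting, hence its summary of N-batch=3 despite the sampler's 129591 singleton batches. A quick arithmetic check, using only numbers from this log:

```python
# Cross-check of the sampler summaries above.
valid_utts = 129_591             # [plot_att] sampler: batch_size=1 over the same key_file
n_batch, bs = 1_012, 128         # [valid] sampler

print(valid_utts - n_batch * bs)  # 55 -> 55 of the 1012 batches hold 129 utts (max=129)
print(valid_utts / n_batch)       # 128.05... -> the reported mean of 128.1

print(45_593 * 128)               # [train] split.3 alone covers ~5.84M utterances
```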
+gpub022:3399536:3399536 [2] NCCL INFO cudaDriverVersion 12010
+gpub022:3399536:3399536 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.122<0>
+gpub022:3399536:3399536 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub022:3399536:3399614 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.122<0>
+gpub022:3399536:3399614 [2] NCCL INFO Using network IB
+gpub022:3399536:3399614 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub022:3399536:3399614 [2] NCCL INFO Trees [0] 19/-1/-1->18->17 [1] 19/-1/-1->18->17
+gpub022:3399536:3399614 [2] NCCL INFO Channel 00/0 : 18[85000] -> 19[c7000] via P2P/IPC
+gpub022:3399536:3399614 [2] NCCL INFO Channel 01/0 : 18[85000] -> 19[c7000] via P2P/IPC
+gpub022:3399536:3399614 [2] NCCL INFO Connected all rings
+gpub022:3399536:3399614 [2] NCCL INFO Channel 00/0 : 18[85000] -> 17[46000] via P2P/IPC
+gpub022:3399536:3399614 [2] NCCL INFO Channel 01/0 : 18[85000] -> 17[46000] via P2P/IPC
+gpub022:3399536:3399614 [2] NCCL INFO Connected all trees
+gpub022:3399536:3399614 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub022:3399536:3399614 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub022:3399536:3399614 [2] NCCL INFO comm 0x93f2210 rank 18 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
[... analogous NCCL INFO bootstrap/affinity/Trees/Channel lines for the remaining ranks trimmed; intra-node hops go via P2P/IPC, cross-node hops via NET/IB/0; each rank's completion line is kept below ...]
+gpub022:3399535:3399615 [1] NCCL INFO comm 0x4fa312f0 rank 17 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub002:1756559:1756638 [0] NCCL INFO comm 0x51930090 rank 4 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub060:1938145:1938218 [2] NCCL INFO comm 0xb591e2d0 rank 38 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub022:3399534:3399616 [0] NCCL INFO comm 0x50711f50 rank 16 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub002:1756561:1756637 [2] NCCL INFO comm 0x51ad54d0 rank 6 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub002:1756562:1756636 [3] NCCL INFO comm 0x9ca8ab90 rank 7 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub002:1756560:1756635 [1] NCCL INFO comm 0x17829840 rank 5 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub032:3246893:3246975 [1] NCCL INFO comm 0x9a6ad00 rank 29 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub076:3343845:3343918 [2] NCCL INFO comm 0x4fe2ad90 rank 50 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub022:3399537:3399617 [3] NCCL INFO comm 0x50214710 rank 19 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub077:252892:252962 [0] NCCL INFO comm 0x97aafd0 rank 52 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub015:828881:828953 [3] NCCL INFO comm 0xb64dad10 rank 11 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub067:1390513:1390587 [0] NCCL INFO comm 0x4ef73970 rank 44 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub015:828878:828950 [0] NCCL INFO comm 0x8fc63100 rank 8 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub015:828879:828952 [1] NCCL INFO comm 0x8ad4b90 rank 9 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub076:3343843:3343919 [0] NCCL INFO comm 0x508de3f0 rank 48 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub076:3343844:3343920 [1] NCCL INFO comm 0xb838ee00 rank 49 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub076:3343843:3343843 [0] NCCL INFO cudaDriverVersion 12010 +gpub076:3343843:3343843 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.176<0> +gpub076:3343843:3343843 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub076:3343843:3343919 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.176<0> +gpub076:3343843:3343919 [0] NCCL INFO Using network IB +gpub076:3343843:3343919 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub076:3343843:3343919 [0] NCCL INFO Trees [0] 49/56/-1->48->32 [1] 49/-1/-1->48->52 +gpub076:3343843:3343919 [0] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [receive] via NET/IB/0 +gpub076:3343843:3343919 [0] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [receive] via NET/IB/0 +gpub076:3343843:3343919 [0] NCCL INFO Channel 00/0 : 48[7000] -> 49[46000] via P2P/IPC +gpub076:3343843:3343919 [0] NCCL INFO Channel 01/0 : 48[7000] -> 49[46000] via P2P/IPC +gpub076:3343843:3343919 [0] NCCL INFO Connected all rings +gpub076:3343843:3343919 [0] NCCL INFO Channel 01/0 : 48[7000] -> 52[7000] [send] via NET/IB/0 +gpub076:3343843:3343919 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [send] via NET/IB/0 +gpub076:3343843:3343919 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [receive] via NET/IB/0 +gpub076:3343843:3343919 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [send] via NET/IB/0 +gpub076:3343843:3343919 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [receive] via NET/IB/0 +gpub076:3343843:3343919 [0] NCCL INFO Channel 01/0 : 52[7000] -> 48[7000] [receive] via NET/IB/0 +gpub076:3343843:3343919 [0] NCCL INFO Connected all trees +gpub076:3343843:3343919 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub076:3343843:3343919 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub076:3343843:3343919 [0] NCCL INFO comm 0x508de3f0 rank 48 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub076:3343844:3343844 [1] NCCL INFO cudaDriverVersion 12010 +gpub076:3343844:3343844 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.176<0> +gpub076:3343844:3343844 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub076:3343844:3343920 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.176<0> +gpub076:3343844:3343920 [1] NCCL INFO Using network IB +gpub076:3343844:3343920 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub076:3343844:3343920 [1] NCCL INFO Trees [0] 50/40/-1->49->48 [1] 50/-1/-1->49->48 +gpub076:3343844:3343920 [1] NCCL INFO Channel 00/0 : 49[46000] -> 50[85000] via P2P/IPC +gpub076:3343844:3343920 [1] NCCL INFO Channel 01/0 : 49[46000] -> 50[85000] via P2P/IPC +gpub076:3343844:3343920 [1] NCCL INFO Connected all rings +gpub076:3343844:3343920 [1] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [receive] via NET/IB/0 +gpub076:3343844:3343920 [1] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [send] via NET/IB/0 +gpub076:3343844:3343920 [1] NCCL INFO Channel 00/0 : 49[46000] -> 48[7000] via P2P/IPC +gpub076:3343844:3343920 [1] NCCL INFO Channel 01/0 : 49[46000] -> 48[7000] via P2P/IPC +gpub076:3343844:3343920 [1] NCCL INFO Connected all trees +gpub076:3343844:3343920 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub076:3343844:3343920 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub076:3343844:3343920 [1] NCCL INFO comm 0xb838ee00 rank 49 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub015:828880:828880 [2] NCCL INFO 
cudaDriverVersion 12010 +gpub015:828880:828880 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.115<0> +gpub015:828880:828880 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub015:828880:828951 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.115<0> +gpub015:828880:828951 [2] NCCL INFO Using network IB +gpub015:828880:828951 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub015:828880:828951 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 +gpub015:828880:828951 [2] NCCL INFO Channel 00/0 : 10[85000] -> 11[c7000] via P2P/IPC +gpub015:828880:828951 [2] NCCL INFO Channel 01/0 : 10[85000] -> 11[c7000] via P2P/IPC +gpub015:828880:828951 [2] NCCL INFO Connected all rings +gpub015:828880:828951 [2] NCCL INFO Channel 00/0 : 10[85000] -> 9[46000] via P2P/IPC +gpub015:828880:828951 [2] NCCL INFO Channel 01/0 : 10[85000] -> 9[46000] via P2P/IPC +gpub015:828880:828951 [2] NCCL INFO Connected all trees +gpub015:828880:828951 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub015:828880:828951 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub015:828880:828951 [2] NCCL INFO comm 0x9e67ed0 rank 10 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpub001:383775:383775 [1] NCCL INFO cudaDriverVersion 12010 +gpub001:383775:383775 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.101<0> +gpub001:383775:383775 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub001:383775:383855 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.101<0> +gpub001:383775:383855 [1] NCCL INFO Using network IB +gpub001:383775:383855 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub001:383775:383855 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +gpub001:383775:383855 [1] NCCL INFO Channel 00/0 : 1[46000] -> 2[85000] via P2P/IPC +gpub001:383775:383855 [1] NCCL INFO Channel 01/0 : 1[46000] -> 2[85000] via P2P/IPC +gpub001:383775:383855 [1] NCCL INFO Connected all rings +gpub001:383775:383855 [1] NCCL INFO Channel 00/0 : 1[46000] -> 0[7000] via P2P/IPC +gpub001:383775:383855 [1] NCCL INFO Channel 01/0 : 1[46000] -> 0[7000] via P2P/IPC +gpub001:383775:383855 [1] NCCL INFO Connected all trees +gpub001:383775:383855 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub001:383775:383855 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub001:383775:383855 [1] NCCL INFO comm 0x8e376a10 rank 1 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub001:383776:383776 [2] NCCL INFO cudaDriverVersion 12010 +gpub001:383776:383776 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.101<0> +gpub001:383776:383776 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub001:383776:383853 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.101<0> +gpub001:383776:383853 [2] NCCL INFO Using network IB +gpub001:383776:383853 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub001:383776:383853 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +gpub001:383776:383853 [2] NCCL INFO Channel 00/0 : 2[85000] -> 3[c7000] via P2P/IPC +gpub001:383776:383853 [2] NCCL INFO Channel 01/0 : 2[85000] -> 3[c7000] via P2P/IPC +gpub001:383776:383853 [2] NCCL INFO Connected all rings +gpub001:383776:383853 [2] NCCL INFO Channel 00/0 : 2[85000] -> 1[46000] via P2P/IPC +gpub001:383776:383853 [2] NCCL INFO Channel 01/0 : 2[85000] -> 1[46000] via P2P/IPC 
+gpub001:383776:383853 [2] NCCL INFO Connected all trees +gpub079:2616803:2616803 [0] NCCL INFO cudaDriverVersion 12010 +gpub079:2616803:2616803 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.179<0> +gpub079:2616803:2616803 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub079:2616803:2616883 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.179<0> +gpub079:2616803:2616883 [0] NCCL INFO Using network IB +gpub079:2616803:2616883 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub079:2616803:2616883 [0] NCCL INFO Trees [0] 57/60/-1->56->48 [1] 57/-1/-1->56->53 +gpub079:2616803:2616883 [0] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [receive] via NET/IB/0 +gpub079:2616803:2616883 [0] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [receive] via NET/IB/0 +gpub079:2616803:2616883 [0] NCCL INFO Channel 00/0 : 56[7000] -> 57[46000] via P2P/IPC +gpub079:2616803:2616883 [0] NCCL INFO Channel 01/0 : 56[7000] -> 57[46000] via P2P/IPC +gpub079:2616803:2616883 [0] NCCL INFO Connected all rings +gpub001:383776:383853 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub001:383776:383853 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub001:383776:383853 [2] NCCL INFO comm 0xa0c5f40 rank 2 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpub079:2616803:2616883 [0] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [receive] via NET/IB/0 +gpub079:2616803:2616883 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [send] via NET/IB/0 +gpub079:2616803:2616883 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [receive] via NET/IB/0 +gpub079:2616803:2616883 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [send] via NET/IB/0 +gpub079:2616803:2616883 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [receive] via NET/IB/0 +gpub079:2616803:2616883 [0] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [send] via NET/IB/0 +gpub079:2616803:2616883 [0] NCCL INFO Connected all trees +gpub079:2616803:2616883 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub079:2616803:2616883 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub079:2616803:2616883 [0] NCCL INFO comm 0xa9779a50 rank 56 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub067:1390515:1390515 [2] NCCL INFO cudaDriverVersion 12010 +gpub067:1390515:1390515 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.167<0> +gpub067:1390515:1390515 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub067:1390515:1390586 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.167<0> +gpub067:1390515:1390586 [2] NCCL INFO Using network IB +gpub067:1390515:1390586 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub067:1390515:1390586 [2] NCCL INFO Trees [0] 47/-1/-1->46->45 [1] 47/-1/-1->46->45 +gpub067:1390515:1390586 [2] NCCL INFO Channel 00/0 : 46[85000] -> 47[c7000] via P2P/IPC +gpub067:1390515:1390586 [2] NCCL INFO Channel 01/0 : 46[85000] -> 47[c7000] via P2P/IPC +gpub067:1390515:1390586 [2] NCCL INFO Connected all rings +gpub067:1390515:1390586 [2] NCCL INFO Channel 00/0 : 46[85000] -> 45[46000] via P2P/IPC +gpub067:1390515:1390586 [2] NCCL INFO Channel 01/0 : 46[85000] -> 45[46000] via P2P/IPC +gpub067:1390515:1390586 [2] NCCL INFO Connected all trees +gpub067:1390515:1390586 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub067:1390515:1390586 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer 
+gpub067:1390515:1390586 [2] NCCL INFO comm 0x5030f0d0 rank 46 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpub067:1390514:1390514 [1] NCCL INFO cudaDriverVersion 12010 +gpub067:1390514:1390514 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.167<0> +gpub067:1390514:1390514 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub067:1390514:1390588 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.167<0> +gpub067:1390514:1390588 [1] NCCL INFO Using network IB +gpub067:1390514:1390588 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub067:1390514:1390588 [1] NCCL INFO Trees [0] 46/-1/-1->45->44 [1] 46/52/-1->45->44 +gpub067:1390514:1390588 [1] NCCL INFO Channel 00/0 : 45[46000] -> 46[85000] via P2P/IPC +gpub067:1390514:1390588 [1] NCCL INFO Channel 01/0 : 45[46000] -> 46[85000] via P2P/IPC +gpub067:1390514:1390588 [1] NCCL INFO Connected all rings +gpub067:1390514:1390588 [1] NCCL INFO Channel 01/0 : 45[46000] -> 52[7000] [send] via NET/IB/0 +gpub067:1390514:1390588 [1] NCCL INFO Channel 01/0 : 52[7000] -> 45[46000] [receive] via NET/IB/0 +gpub077:252893:252893 [1] NCCL INFO cudaDriverVersion 12010 +gpub077:252893:252893 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.177<0> +gpub077:252893:252893 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub077:252893:252961 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.177<0> +gpub077:252893:252961 [1] NCCL INFO Using network IB +gpub077:252893:252961 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub077:252893:252961 [1] NCCL INFO Trees [0] 54/-1/-1->53->52 [1] 54/56/-1->53->52 +gpub077:252893:252961 [1] NCCL INFO Channel 00/0 : 53[46000] -> 54[85000] via P2P/IPC +gpub077:252893:252961 [1] NCCL INFO Channel 01/0 : 53[46000] -> 54[85000] via P2P/IPC +gpub077:252893:252961 [1] NCCL INFO Connected all rings +gpub077:252893:252961 [1] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [send] via NET/IB/0 +gpub077:252893:252961 [1] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [receive] via NET/IB/0 +gpub067:1390514:1390588 [1] NCCL INFO Channel 00/0 : 45[46000] -> 44[7000] via P2P/IPC +gpub067:1390514:1390588 [1] NCCL INFO Channel 01/0 : 45[46000] -> 44[7000] via P2P/IPC +gpub067:1390514:1390588 [1] NCCL INFO Connected all trees +gpub067:1390514:1390588 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub067:1390514:1390588 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub067:1390514:1390588 [1] NCCL INFO comm 0xa70b75d0 rank 45 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub077:252893:252961 [1] NCCL INFO Channel 00/0 : 53[46000] -> 52[7000] via P2P/IPC +gpub077:252893:252961 [1] NCCL INFO Channel 01/0 : 53[46000] -> 52[7000] via P2P/IPC +gpub077:252893:252961 [1] NCCL INFO Connected all trees +gpub077:252893:252961 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub077:252893:252961 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub077:252893:252961 [1] NCCL INFO comm 0x509e6280 rank 53 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub016:1380823:1380823 [2] NCCL INFO cudaDriverVersion 12010 +gpub016:1380823:1380823 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.116<0> +gpub016:1380823:1380823 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub016:1380823:1380896 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.116<0> +gpub016:1380823:1380896 
[2] NCCL INFO Using network IB +gpub016:1380823:1380896 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub016:1380823:1380896 [2] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 +gpub016:1380823:1380896 [2] NCCL INFO Channel 00/0 : 14[85000] -> 15[c7000] via P2P/IPC +gpub016:1380823:1380896 [2] NCCL INFO Channel 01/0 : 14[85000] -> 15[c7000] via P2P/IPC +gpub016:1380823:1380896 [2] NCCL INFO Connected all rings +gpub016:1380823:1380896 [2] NCCL INFO Channel 00/0 : 14[85000] -> 13[46000] via P2P/IPC +gpub016:1380823:1380896 [2] NCCL INFO Channel 01/0 : 14[85000] -> 13[46000] via P2P/IPC +gpub016:1380823:1380896 [2] NCCL INFO Connected all trees +gpub016:1380823:1380896 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub016:1380823:1380896 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub016:1380823:1380896 [2] NCCL INFO comm 0x517fee10 rank 14 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpub032:3246894:3246894 [2] NCCL INFO cudaDriverVersion 12010 +gpub032:3246894:3246894 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.132<0> +gpub032:3246894:3246894 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub032:3246894:3246976 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.132<0> +gpub032:3246894:3246976 [2] NCCL INFO Using network IB +gpub032:3246894:3246976 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub032:3246894:3246976 [2] NCCL INFO Trees [0] 31/-1/-1->30->29 [1] 31/-1/-1->30->29 +gpub032:3246894:3246976 [2] NCCL INFO Channel 00/0 : 30[85000] -> 31[c7000] via P2P/IPC +gpub032:3246894:3246976 [2] NCCL INFO Channel 01/0 : 30[85000] -> 31[c7000] via P2P/IPC +gpub032:3246894:3246976 [2] NCCL INFO Connected all rings +gpub032:3246894:3246976 [2] NCCL INFO Channel 00/0 : 30[85000] -> 29[46000] via P2P/IPC +gpub032:3246894:3246976 [2] NCCL INFO Channel 01/0 : 30[85000] -> 29[46000] via P2P/IPC +gpub032:3246894:3246976 [2] NCCL INFO Connected all trees +gpub032:3246894:3246976 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub032:3246894:3246976 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub032:3246894:3246976 [2] NCCL INFO comm 0x9ddee7e0 rank 30 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpub001:383777:383777 [3] NCCL INFO cudaDriverVersion 12010 +gpub001:383777:383777 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.101<0> +gpub001:383777:383777 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub001:383777:383854 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.101<0> +gpub001:383777:383854 [3] NCCL INFO Using network IB +gpub001:383777:383854 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub001:383777:383854 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +gpub001:383777:383854 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 4[7000] [send] via NET/IB/0 +gpub001:383777:383854 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 4[7000] [send] via NET/IB/0 +gpub001:383777:383854 [3] NCCL INFO Connected all rings +gpub001:383777:383854 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 2[85000] via P2P/IPC +gpub001:383777:383854 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 2[85000] via P2P/IPC +gpub001:383777:383854 [3] NCCL INFO Connected all trees +gpub001:383777:383854 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub001:383777:383854 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub001:383777:383854 [3] NCCL INFO 
comm 0xc243d9d0 rank 3 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub030:2310659:2310659 [2] NCCL INFO cudaDriverVersion 12010 +gpub030:2310659:2310659 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.130<0> +gpub030:2310659:2310659 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub030:2310659:2310728 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.130<0> +gpub030:2310659:2310728 [2] NCCL INFO Using network IB +gpub030:2310659:2310728 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub030:2310659:2310728 [2] NCCL INFO Trees [0] 23/-1/-1->22->21 [1] 23/-1/-1->22->21 +gpub030:2310659:2310728 [2] NCCL INFO Channel 00/0 : 22[85000] -> 23[c7000] via P2P/IPC +gpub030:2310659:2310728 [2] NCCL INFO Channel 01/0 : 22[85000] -> 23[c7000] via P2P/IPC +gpub030:2310659:2310728 [2] NCCL INFO Connected all rings +gpub030:2310659:2310728 [2] NCCL INFO Channel 00/0 : 22[85000] -> 21[46000] via P2P/IPC +gpub030:2310659:2310728 [2] NCCL INFO Channel 01/0 : 22[85000] -> 21[46000] via P2P/IPC +gpub030:2310659:2310728 [2] NCCL INFO Connected all trees +gpub030:2310659:2310728 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub030:2310659:2310728 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub030:2310659:2310728 [2] NCCL INFO comm 0x8de12f60 rank 22 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpub030:2310657:2310657 [0] NCCL INFO cudaDriverVersion 12010 +gpub030:2310657:2310657 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.130<0> +gpub030:2310657:2310657 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub030:2310657:2310726 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.130<0> +gpub030:2310657:2310726 [0] NCCL INFO Using network IB +gpub030:2310657:2310726 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub030:2310657:2310726 [0] NCCL INFO Trees [0] 21/-1/-1->20->25 [1] 21/16/-1->20->13 +gpub030:2310657:2310726 [0] NCCL INFO Channel 00/0 : 19[c7000] -> 20[7000] [receive] via NET/IB/0 +gpub030:2310657:2310726 [0] NCCL INFO Channel 01/0 : 19[c7000] -> 20[7000] [receive] via NET/IB/0 +gpub030:2310657:2310726 [0] NCCL INFO Channel 00/0 : 20[7000] -> 21[46000] via P2P/IPC +gpub030:2310657:2310726 [0] NCCL INFO Channel 01/0 : 20[7000] -> 21[46000] via P2P/IPC +gpub030:2310657:2310726 [0] NCCL INFO Connected all rings +gpub060:1938143:1938143 [0] NCCL INFO cudaDriverVersion 12010 +gpub060:1938143:1938143 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.160<0> +gpub060:1938143:1938143 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub060:1938143:1938219 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.160<0> +gpub060:1938143:1938219 [0] NCCL INFO Using network IB +gpub060:1938143:1938219 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub060:1938143:1938219 [0] NCCL INFO Trees [0] 37/-1/-1->36->41 [1] 37/32/-1->36->44 +gpub060:1938143:1938219 [0] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [receive] via NET/IB/0 +gpub060:1938143:1938219 [0] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [receive] via NET/IB/0 +gpub060:1938143:1938219 [0] NCCL INFO Channel 00/0 : 36[7000] -> 37[46000] via P2P/IPC +gpub060:1938143:1938219 [0] NCCL INFO Channel 01/0 : 36[7000] -> 37[46000] via P2P/IPC +gpub060:1938143:1938219 [0] NCCL INFO Connected all rings +gpub030:2310657:2310726 [0] NCCL INFO Channel 01/0 : 16[7000] -> 20[7000] [receive] 
via NET/IB/0 +gpub030:2310657:2310726 [0] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [send] via NET/IB/0 +gpub030:2310657:2310726 [0] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [receive] via NET/IB/0 +gpub030:2310657:2310726 [0] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [send] via NET/IB/0 +gpub030:2310657:2310726 [0] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [receive] via NET/IB/0 +gpub030:2310657:2310726 [0] NCCL INFO Channel 01/0 : 20[7000] -> 16[7000] [send] via NET/IB/0 +gpub030:2310657:2310726 [0] NCCL INFO Connected all trees +gpub030:2310657:2310726 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub030:2310657:2310726 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub030:2310657:2310726 [0] NCCL INFO comm 0x50d929d0 rank 20 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub060:1938143:1938219 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [receive] via NET/IB/0 +gpub060:1938143:1938219 [0] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [send] via NET/IB/0 +gpub060:1938143:1938219 [0] NCCL INFO Channel 01/0 : 36[7000] -> 44[7000] [send] via NET/IB/0 +gpub060:1938143:1938219 [0] NCCL INFO Channel 01/0 : 44[7000] -> 36[7000] [receive] via NET/IB/0 +gpub060:1938143:1938219 [0] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [receive] via NET/IB/0 +gpub060:1938143:1938219 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [send] via NET/IB/0 +gpub060:1938143:1938219 [0] NCCL INFO Connected all trees +gpub060:1938143:1938219 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub060:1938143:1938219 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub060:1938143:1938219 [0] NCCL INFO comm 0x50561020 rank 36 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub060:1938144:1938144 [1] NCCL INFO cudaDriverVersion 12010 +gpub060:1938144:1938144 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.160<0> +gpub060:1938144:1938144 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub060:1938144:1938217 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.160<0> +gpub060:1938144:1938217 [1] NCCL INFO Using network IB +gpub060:1938144:1938217 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub060:1938144:1938217 [1] NCCL INFO Trees [0] 38/-1/-1->37->36 [1] 38/40/-1->37->36 +gpub060:1938144:1938217 [1] NCCL INFO Channel 00/0 : 37[46000] -> 38[85000] via P2P/IPC +gpub060:1938144:1938217 [1] NCCL INFO Channel 01/0 : 37[46000] -> 38[85000] via P2P/IPC +gpub060:1938144:1938217 [1] NCCL INFO Connected all rings +gpub060:1938144:1938217 [1] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [send] via NET/IB/0 +gpub060:1938144:1938217 [1] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [receive] via NET/IB/0 +gpub060:1938144:1938217 [1] NCCL INFO Channel 00/0 : 37[46000] -> 36[7000] via P2P/IPC +gpub060:1938144:1938217 [1] NCCL INFO Channel 01/0 : 37[46000] -> 36[7000] via P2P/IPC +gpub060:1938144:1938217 [1] NCCL INFO Connected all trees +gpub060:1938144:1938217 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub060:1938144:1938217 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub060:1938144:1938217 [1] NCCL INFO comm 0x4f3bc650 rank 37 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub079:2616806:2616806 [3] NCCL INFO cudaDriverVersion 12010 +gpub079:2616806:2616806 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.179<0> +gpub079:2616806:2616806 [3] NCCL INFO NET/Plugin : No plugin found 
(libnccl-net.so), using internal implementation +gpub079:2616806:2616881 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.179<0> +gpub079:2616806:2616881 [3] NCCL INFO Using network IB +gpub079:2616806:2616881 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub079:2616806:2616881 [3] NCCL INFO Trees [0] -1/-1/-1->59->58 [1] -1/-1/-1->59->58 +gpub079:2616806:2616881 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [send] via NET/IB/0 +gpub079:2616806:2616881 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [send] via NET/IB/0 +gpub079:2616806:2616881 [3] NCCL INFO Connected all rings +gpub079:2616806:2616881 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 58[85000] via P2P/IPC +gpub079:2616806:2616881 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 58[85000] via P2P/IPC +gpub079:2616806:2616881 [3] NCCL INFO Connected all trees +gpub079:2616806:2616881 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub079:2616806:2616881 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub079:2616806:2616881 [3] NCCL INFO comm 0x89762f0 rank 59 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub066:1432046:1432046 [1] NCCL INFO cudaDriverVersion 12010 +gpub066:1432046:1432046 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.166<0> +gpub066:1432046:1432046 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub066:1432046:1432129 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.166<0> +gpub066:1432046:1432129 [1] NCCL INFO Using network IB +gpub066:1432046:1432129 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub066:1432046:1432129 [1] NCCL INFO Trees [0] 42/36/-1->41->40 [1] 42/-1/-1->41->40 +gpub066:1432046:1432129 [1] NCCL INFO Channel 00/0 : 41[46000] -> 42[85000] via P2P/IPC +gpub066:1432046:1432129 [1] NCCL INFO Channel 01/0 : 41[46000] -> 42[85000] via P2P/IPC +gpub066:1432046:1432129 [1] NCCL INFO Connected all rings +gpub066:1432046:1432129 [1] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [receive] via NET/IB/0 +gpub066:1432046:1432129 [1] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [send] via NET/IB/0 +gpub066:1432046:1432129 [1] NCCL INFO Channel 00/0 : 41[46000] -> 40[7000] via P2P/IPC +gpub066:1432046:1432129 [1] NCCL INFO Channel 01/0 : 41[46000] -> 40[7000] via P2P/IPC +gpub066:1432046:1432129 [1] NCCL INFO Connected all trees +gpub066:1432046:1432129 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub066:1432046:1432129 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub066:1432046:1432129 [1] NCCL INFO comm 0x4fabed20 rank 41 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub066:1432045:1432045 [0] NCCL INFO cudaDriverVersion 12010 +gpub066:1432045:1432045 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.166<0> +gpub066:1432045:1432045 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub066:1432045:1432128 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.166<0> +gpub066:1432045:1432128 [0] NCCL INFO Using network IB +gpub066:1432045:1432128 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub066:1432045:1432128 [0] NCCL INFO Trees [0] 41/44/-1->40->49 [1] 41/-1/-1->40->37 +gpub066:1432045:1432128 [0] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [receive] via NET/IB/0 +gpub066:1432045:1432128 [0] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [receive] via NET/IB/0 +gpub066:1432045:1432128 [0] NCCL INFO Channel 00/0 : 40[7000] -> 
41[46000] via P2P/IPC +gpub066:1432045:1432128 [0] NCCL INFO Channel 01/0 : 40[7000] -> 41[46000] via P2P/IPC +gpub066:1432045:1432128 [0] NCCL INFO Connected all rings +gpub066:1432045:1432128 [0] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [receive] via NET/IB/0 +gpub066:1432045:1432128 [0] NCCL INFO Channel 00/0 : 40[7000] -> 44[7000] [send] via NET/IB/0 +gpub066:1432045:1432128 [0] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [send] via NET/IB/0 +gpub066:1432045:1432128 [0] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [receive] via NET/IB/0 +gpub066:1432045:1432128 [0] NCCL INFO Channel 00/0 : 44[7000] -> 40[7000] [receive] via NET/IB/0 +gpub066:1432045:1432128 [0] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [send] via NET/IB/0 +gpub066:1432045:1432128 [0] NCCL INFO Connected all trees +gpub066:1432045:1432128 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub066:1432045:1432128 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub066:1432045:1432128 [0] NCCL INFO comm 0x50653520 rank 40 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub016:1380824:1380824 [3] NCCL INFO cudaDriverVersion 12010 +gpub016:1380824:1380824 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.116<0> +gpub016:1380824:1380824 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub016:1380824:1380897 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.116<0> +gpub016:1380824:1380897 [3] NCCL INFO Using network IB +gpub016:1380824:1380897 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub016:1380824:1380897 [3] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] -1/-1/-1->15->14 +gpub016:1380824:1380897 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 16[7000] [send] via NET/IB/0 +gpub016:1380824:1380897 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 16[7000] [send] via NET/IB/0 +gpub016:1380824:1380897 [3] NCCL INFO Connected all rings +gpub016:1380824:1380897 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 14[85000] via P2P/IPC +gpub016:1380824:1380897 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 14[85000] via P2P/IPC +gpub016:1380824:1380897 [3] NCCL INFO Connected all trees +gpub016:1380824:1380897 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub016:1380824:1380897 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub016:1380824:1380897 [3] NCCL INFO comm 0x8d241cc0 rank 15 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub031:1878314:1878314 [3] NCCL INFO cudaDriverVersion 12010 +gpub031:1878314:1878314 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0> +gpub031:1878314:1878314 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub031:1878314:1878391 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0> +gpub031:1878314:1878391 [3] NCCL INFO Using network IB +gpub031:1878314:1878391 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub031:1878314:1878391 [3] NCCL INFO Trees [0] -1/-1/-1->27->26 [1] -1/-1/-1->27->26 +gpub031:1878314:1878391 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [send] via NET/IB/0 +gpub031:1878314:1878391 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [send] via NET/IB/0 +gpub031:1878314:1878391 [3] NCCL INFO Connected all rings +gpub031:1878314:1878391 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 26[85000] via P2P/IPC +gpub031:1878314:1878391 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 26[85000] via P2P/IPC +gpub066:1432048:1432048 [3] NCCL INFO cudaDriverVersion 12010 
+gpub066:1432048:1432048 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.166<0> +gpub066:1432048:1432048 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub066:1432048:1432126 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.166<0> +gpub066:1432048:1432126 [3] NCCL INFO Using network IB +gpub066:1432048:1432126 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub066:1432048:1432126 [3] NCCL INFO Trees [0] -1/-1/-1->43->42 [1] -1/-1/-1->43->42 +gpub066:1432048:1432126 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 44[7000] [send] via NET/IB/0 +gpub066:1432048:1432126 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 44[7000] [send] via NET/IB/0 +gpub066:1432048:1432126 [3] NCCL INFO Connected all rings +gpub066:1432048:1432126 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 42[85000] via P2P/IPC +gpub066:1432048:1432126 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 42[85000] via P2P/IPC +gpub031:1878314:1878391 [3] NCCL INFO Connected all trees +gpub031:1878314:1878391 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub031:1878314:1878391 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub031:1878314:1878391 [3] NCCL INFO comm 0x511daaa0 rank 27 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub066:1432048:1432126 [3] NCCL INFO Connected all trees +gpub066:1432048:1432126 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub066:1432048:1432126 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub066:1432048:1432126 [3] NCCL INFO comm 0x51126a70 rank 43 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub067:1390516:1390516 [3] NCCL INFO cudaDriverVersion 12010 +gpub067:1390516:1390516 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.167<0> +gpub067:1390516:1390516 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub067:1390516:1390585 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.167<0> +gpub067:1390516:1390585 [3] NCCL INFO Using network IB +gpub067:1390516:1390585 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub067:1390516:1390585 [3] NCCL INFO Trees [0] -1/-1/-1->47->46 [1] -1/-1/-1->47->46 +gpub067:1390516:1390585 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [send] via NET/IB/0 +gpub067:1390516:1390585 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [send] via NET/IB/0 +gpub067:1390516:1390585 [3] NCCL INFO Connected all rings +gpub067:1390516:1390585 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 46[85000] via P2P/IPC +gpub067:1390516:1390585 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 46[85000] via P2P/IPC +gpub067:1390516:1390585 [3] NCCL INFO Connected all trees +gpub067:1390516:1390585 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub067:1390516:1390585 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub067:1390516:1390585 [3] NCCL INFO comm 0x509fc1c0 rank 47 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub031:1878313:1878313 [2] NCCL INFO cudaDriverVersion 12010 +gpub031:1878313:1878313 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0> +gpub031:1878313:1878313 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub031:1878313:1878389 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0> +gpub031:1878313:1878389 [2] NCCL INFO Using network IB +gpub031:1878313:1878389 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub031:1878313:1878389 [2] NCCL INFO Trees [0] 
27/-1/-1->26->25 [1] 27/-1/-1->26->25 +gpub031:1878313:1878389 [2] NCCL INFO Channel 00/0 : 26[85000] -> 27[c7000] via P2P/IPC +gpub031:1878313:1878389 [2] NCCL INFO Channel 01/0 : 26[85000] -> 27[c7000] via P2P/IPC +gpub031:1878313:1878389 [2] NCCL INFO Connected all rings +gpub031:1878313:1878389 [2] NCCL INFO Channel 00/0 : 26[85000] -> 25[46000] via P2P/IPC +gpub031:1878313:1878389 [2] NCCL INFO Channel 01/0 : 26[85000] -> 25[46000] via P2P/IPC +gpub031:1878313:1878389 [2] NCCL INFO Connected all trees +gpub031:1878313:1878389 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub031:1878313:1878389 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub031:1878313:1878389 [2] NCCL INFO comm 0xa54f400 rank 26 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpub001:383774:383852 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.101<0> +gpub001:383774:383852 [0] NCCL INFO Using network IB +gpub001:383774:383852 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub001:383774:383852 [0] NCCL INFO Channel 00/02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 +gpub001:383774:383852 [0] NCCL INFO Channel 01/02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 +gpub001:383774:383852 [0] NCCL INFO Trees [0] 1/32/-1->0->-1 [1] 1/-1/-1->0->4 +gpub001:383774:383852 [0] NCCL INFO Channel 00/0 : 63[c7000] -> 0[7000] [receive] via NET/IB/0 +gpub001:383774:383852 [0] NCCL INFO Channel 01/0 : 63[c7000] -> 0[7000] [receive] via NET/IB/0 +gpub001:383774:383852 [0] NCCL INFO Channel 00/0 : 0[7000] -> 1[46000] via P2P/IPC +gpub001:383774:383852 [0] NCCL INFO Channel 01/0 : 0[7000] -> 1[46000] via P2P/IPC +gpub001:383774:383852 [0] NCCL INFO Connected all rings +gpub001:383774:383852 [0] NCCL INFO Channel 01/0 : 0[7000] -> 4[7000] [send] via NET/IB/0 +gpub001:383774:383852 [0] NCCL INFO Channel 00/0 : 32[7000] -> 0[7000] [receive] via NET/IB/0 +gpub001:383774:383852 [0] NCCL INFO Channel 00/0 : 0[7000] -> 32[7000] [send] via NET/IB/0 +gpub001:383774:383852 [0] NCCL INFO Channel 01/0 : 4[7000] -> 0[7000] [receive] via NET/IB/0 +gpub001:383774:383852 [0] NCCL INFO Connected all trees +gpub001:383774:383852 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub001:383774:383852 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub001:383774:383852 [0] NCCL INFO comm 0x9b744f70 rank 0 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub059:1894384:1894384 [1] NCCL INFO cudaDriverVersion 12010 +gpub059:1894384:1894384 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.159<0> +gpub059:1894384:1894384 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub059:1894384:1894459 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.159<0> +gpub059:1894384:1894459 [1] NCCL INFO Using network IB +gpub059:1894384:1894459 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub059:1894384:1894459 [1] NCCL INFO Trees [0] 34/16/-1->33->32 [1] 34/-1/-1->33->32 +gpub059:1894384:1894459 [1] NCCL INFO Channel 00/0 : 33[46000] -> 34[85000] via P2P/IPC +gpub059:1894384:1894459 [1] NCCL INFO Channel 01/0 : 33[46000] -> 34[85000] via P2P/IPC +gpub059:1894384:1894459 [1] NCCL INFO Connected all rings +gpub059:1894384:1894459 [1] NCCL INFO Channel 00/0 : 16[7000] -> 33[46000] [receive] via NET/IB/0 +gpub059:1894384:1894459 [1] NCCL INFO Channel 00/0 : 33[46000] -> 16[7000] [send] via NET/IB/0 +gpub059:1894384:1894459 [1] NCCL INFO Channel 00/0 : 33[46000] -> 
32[7000] via P2P/IPC +gpub059:1894384:1894459 [1] NCCL INFO Channel 01/0 : 33[46000] -> 32[7000] via P2P/IPC +gpub059:1894384:1894459 [1] NCCL INFO Connected all trees +gpub059:1894384:1894459 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub059:1894384:1894459 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub059:1894384:1894459 [1] NCCL INFO comm 0xb7b49460 rank 33 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub016:1380821:1380821 [0] NCCL INFO cudaDriverVersion 12010 +gpub016:1380821:1380821 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.116<0> +gpub016:1380821:1380821 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub016:1380821:1380899 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.116<0> +gpub016:1380821:1380899 [0] NCCL INFO Using network IB +gpub016:1380821:1380899 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub016:1380821:1380899 [0] NCCL INFO Trees [0] 13/-1/-1->12->8 [1] 13/4/-1->12->28 +gpub016:1380821:1380899 [0] NCCL INFO Channel 00/0 : 11[c7000] -> 12[7000] [receive] via NET/IB/0 +gpub016:1380821:1380899 [0] NCCL INFO Channel 01/0 : 11[c7000] -> 12[7000] [receive] via NET/IB/0 +gpub016:1380821:1380899 [0] NCCL INFO Channel 00/0 : 12[7000] -> 13[46000] via P2P/IPC +gpub016:1380821:1380899 [0] NCCL INFO Channel 01/0 : 12[7000] -> 13[46000] via P2P/IPC +gpub016:1380821:1380899 [0] NCCL INFO Connected all rings +gpub016:1380821:1380899 [0] NCCL INFO Channel 00/0 : 8[7000] -> 12[7000] [receive] via NET/IB/0 +gpub016:1380821:1380899 [0] NCCL INFO Channel 01/0 : 4[7000] -> 12[7000] [receive] via NET/IB/0 +gpub016:1380821:1380899 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [send] via NET/IB/0 +gpub016:1380821:1380899 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [receive] via NET/IB/0 +gpub016:1380821:1380899 [0] NCCL INFO Channel 01/0 : 12[7000] -> 4[7000] [send] via NET/IB/0 +gpub016:1380821:1380899 [0] NCCL INFO Channel 00/0 : 12[7000] -> 8[7000] [send] via NET/IB/0 +gpub016:1380821:1380899 [0] NCCL INFO Connected all trees +gpub016:1380821:1380899 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub016:1380821:1380899 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub016:1380821:1380899 [0] NCCL INFO comm 0x50896990 rank 12 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub077:252894:252894 [2] NCCL INFO cudaDriverVersion 12010 +gpub077:252894:252894 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.177<0> +gpub077:252894:252894 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub077:252894:252964 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.177<0> +gpub077:252894:252964 [2] NCCL INFO Using network IB +gpub077:252894:252964 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub077:252894:252964 [2] NCCL INFO Trees [0] 55/-1/-1->54->53 [1] 55/-1/-1->54->53 +gpub077:252894:252964 [2] NCCL INFO Channel 00/0 : 54[85000] -> 55[c7000] via P2P/IPC +gpub077:252894:252964 [2] NCCL INFO Channel 01/0 : 54[85000] -> 55[c7000] via P2P/IPC +gpub077:252894:252964 [2] NCCL INFO Connected all rings +gpub077:252894:252964 [2] NCCL INFO Channel 00/0 : 54[85000] -> 53[46000] via P2P/IPC +gpub077:252894:252964 [2] NCCL INFO Channel 01/0 : 54[85000] -> 53[46000] via P2P/IPC +gpub077:252894:252964 [2] NCCL INFO Connected all trees +gpub077:252894:252964 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub077:252894:252964 [2] NCCL 
INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub077:252894:252964 [2] NCCL INFO comm 0xc19a4b40 rank 54 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpub032:3246892:3246892 [0] NCCL INFO cudaDriverVersion 12010 +gpub032:3246892:3246892 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.132<0> +gpub032:3246892:3246892 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub032:3246892:3246974 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.132<0> +gpub032:3246892:3246974 [0] NCCL INFO Using network IB +gpub032:3246892:3246974 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub032:3246892:3246974 [0] NCCL INFO Trees [0] 29/-1/-1->28->24 [1] 29/12/-1->28->60 +gpub032:3246892:3246974 [0] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [receive] via NET/IB/0 +gpub032:3246892:3246974 [0] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [receive] via NET/IB/0 +gpub032:3246892:3246974 [0] NCCL INFO Channel 00/0 : 28[7000] -> 29[46000] via P2P/IPC +gpub032:3246892:3246974 [0] NCCL INFO Channel 01/0 : 28[7000] -> 29[46000] via P2P/IPC +gpub032:3246892:3246974 [0] NCCL INFO Connected all rings +gpub032:3246892:3246974 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [receive] via NET/IB/0 +gpub032:3246892:3246974 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [receive] via NET/IB/0 +gpub032:3246892:3246974 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [receive] via NET/IB/0 +gpub032:3246892:3246974 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [send] via NET/IB/0 +gpub032:3246892:3246974 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [send] via NET/IB/0 +gpub032:3246892:3246974 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [send] via NET/IB/0 +gpub032:3246892:3246974 [0] NCCL INFO Connected all trees +gpub032:3246892:3246974 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub032:3246892:3246974 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub032:3246892:3246974 [0] NCCL INFO comm 0x4ff3dba0 rank 28 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub079:2616804:2616804 [1] NCCL INFO cudaDriverVersion 12010 +gpub079:2616804:2616804 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.179<0> +gpub079:2616804:2616804 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub079:2616804:2616880 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.179<0> +gpub079:2616804:2616880 [1] NCCL INFO Using network IB +gpub079:2616804:2616880 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub079:2616804:2616880 [1] NCCL INFO Trees [0] 58/52/-1->57->56 [1] 58/-1/-1->57->56 +gpub079:2616804:2616880 [1] NCCL INFO Channel 00/0 : 57[46000] -> 58[85000] via P2P/IPC +gpub079:2616804:2616880 [1] NCCL INFO Channel 01/0 : 57[46000] -> 58[85000] via P2P/IPC +gpub079:2616804:2616880 [1] NCCL INFO Connected all rings +gpub079:2616804:2616880 [1] NCCL INFO Channel 00/0 : 52[7000] -> 57[46000] [receive] via NET/IB/0 +gpub079:2616804:2616880 [1] NCCL INFO Channel 00/0 : 57[46000] -> 52[7000] [send] via NET/IB/0 +gpub079:2616804:2616880 [1] NCCL INFO Channel 00/0 : 57[46000] -> 56[7000] via P2P/IPC +gpub079:2616804:2616880 [1] NCCL INFO Channel 01/0 : 57[46000] -> 56[7000] via P2P/IPC +gpub079:2616804:2616880 [1] NCCL INFO Connected all trees +gpub079:2616804:2616880 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub079:2616804:2616880 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels 
per peer +gpub079:2616804:2616880 [1] NCCL INFO comm 0x9014adc0 rank 57 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub079:2616805:2616805 [2] NCCL INFO cudaDriverVersion 12010 +gpub079:2616805:2616805 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.179<0> +gpub079:2616805:2616805 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub079:2616805:2616882 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.179<0> +gpub079:2616805:2616882 [2] NCCL INFO Using network IB +gpub079:2616805:2616882 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub079:2616805:2616882 [2] NCCL INFO Trees [0] 59/-1/-1->58->57 [1] 59/-1/-1->58->57 +gpub079:2616805:2616882 [2] NCCL INFO Channel 00/0 : 58[85000] -> 59[c7000] via P2P/IPC +gpub079:2616805:2616882 [2] NCCL INFO Channel 01/0 : 58[85000] -> 59[c7000] via P2P/IPC +gpub079:2616805:2616882 [2] NCCL INFO Connected all rings +gpub079:2616805:2616882 [2] NCCL INFO Channel 00/0 : 58[85000] -> 57[46000] via P2P/IPC +gpub079:2616805:2616882 [2] NCCL INFO Channel 01/0 : 58[85000] -> 57[46000] via P2P/IPC +gpub079:2616805:2616882 [2] NCCL INFO Connected all trees +gpub079:2616805:2616882 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub079:2616805:2616882 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub079:2616805:2616882 [2] NCCL INFO comm 0x8b2c9c20 rank 58 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpub060:1938146:1938146 [3] NCCL INFO cudaDriverVersion 12010 +gpub060:1938146:1938146 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.160<0> +gpub060:1938146:1938146 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub060:1938146:1938220 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.160<0> +gpub060:1938146:1938220 [3] NCCL INFO Using network IB +gpub060:1938146:1938220 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub060:1938146:1938220 [3] NCCL INFO Trees [0] -1/-1/-1->39->38 [1] -1/-1/-1->39->38 +gpub060:1938146:1938220 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [send] via NET/IB/0 +gpub060:1938146:1938220 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [send] via NET/IB/0 +gpub060:1938146:1938220 [3] NCCL INFO Connected all rings +gpub060:1938146:1938220 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 38[85000] via P2P/IPC +gpub060:1938146:1938220 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 38[85000] via P2P/IPC +gpub060:1938146:1938220 [3] NCCL INFO Connected all trees +gpub060:1938146:1938220 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub060:1938146:1938220 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub060:1938146:1938220 [3] NCCL INFO comm 0x50addeb0 rank 39 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub016:1380822:1380822 [1] NCCL INFO cudaDriverVersion 12010 +gpub016:1380822:1380822 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.116<0> +gpub016:1380822:1380822 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub016:1380822:1380898 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.116<0> +gpub016:1380822:1380898 [1] NCCL INFO Using network IB +gpub016:1380822:1380898 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub016:1380822:1380898 [1] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/20/-1->13->12 +gpub016:1380822:1380898 [1] NCCL INFO Channel 00/0 : 13[46000] -> 14[85000] via P2P/IPC +gpub016:1380822:1380898 [1] NCCL INFO Channel 
01/0 : 13[46000] -> 14[85000] via P2P/IPC
+gpub016:1380822:1380898 [1] NCCL INFO Connected all rings
+gpub016:1380822:1380898 [1] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [send] via NET/IB/0
+gpub016:1380822:1380898 [1] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [receive] via NET/IB/0
+gpub016:1380822:1380898 [1] NCCL INFO Channel 00/0 : 13[46000] -> 12[7000] via P2P/IPC
+gpub016:1380822:1380898 [1] NCCL INFO Channel 01/0 : 13[46000] -> 12[7000] via P2P/IPC
+gpub016:1380822:1380898 [1] NCCL INFO Connected all trees
+gpub016:1380822:1380898 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub016:1380822:1380898 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub016:1380822:1380898 [1] NCCL INFO comm 0x9b8bb7a0 rank 13 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub066:1432047:1432047 [2] NCCL INFO cudaDriverVersion 12010
+gpub066:1432047:1432047 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.166<0>
+gpub066:1432047:1432047 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub066:1432047:1432127 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.166<0>
+gpub066:1432047:1432127 [2] NCCL INFO Using network IB
+gpub066:1432047:1432127 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub066:1432047:1432127 [2] NCCL INFO Trees [0] 43/-1/-1->42->41 [1] 43/-1/-1->42->41
+gpub066:1432047:1432127 [2] NCCL INFO Channel 00/0 : 42[85000] -> 43[c7000] via P2P/IPC
+gpub066:1432047:1432127 [2] NCCL INFO Channel 01/0 : 42[85000] -> 43[c7000] via P2P/IPC
+gpub066:1432047:1432127 [2] NCCL INFO Connected all rings
+gpub066:1432047:1432127 [2] NCCL INFO Channel 00/0 : 42[85000] -> 41[46000] via P2P/IPC
+gpub066:1432047:1432127 [2] NCCL INFO Channel 01/0 : 42[85000] -> 41[46000] via P2P/IPC
+gpub066:1432047:1432127 [2] NCCL INFO Connected all trees
+gpub066:1432047:1432127 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub066:1432047:1432127 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub066:1432047:1432127 [2] NCCL INFO comm 0x9ed0150 rank 42 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub030:2310658:2310658 [1] NCCL INFO cudaDriverVersion 12010
+gpub030:2310658:2310658 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.130<0>
+gpub030:2310658:2310658 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub030:2310658:2310725 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.130<0>
+gpub030:2310658:2310725 [1] NCCL INFO Using network IB
+gpub030:2310658:2310725 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub030:2310658:2310725 [1] NCCL INFO Trees [0] 22/-1/-1->21->20 [1] 22/24/-1->21->20
+gpub030:2310658:2310725 [1] NCCL INFO Channel 00/0 : 21[46000] -> 22[85000] via P2P/IPC
+gpub030:2310658:2310725 [1] NCCL INFO Channel 01/0 : 21[46000] -> 22[85000] via P2P/IPC
+gpub030:2310658:2310725 [1] NCCL INFO Connected all rings
+gpub030:2310658:2310725 [1] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [send] via NET/IB/0
+gpub030:2310658:2310725 [1] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [receive] via NET/IB/0
+gpub030:2310658:2310725 [1] NCCL INFO Channel 00/0 : 21[46000] -> 20[7000] via P2P/IPC
+gpub030:2310658:2310725 [1] NCCL INFO Channel 01/0 : 21[46000] -> 20[7000] via P2P/IPC
+gpub030:2310658:2310725 [1] NCCL INFO Connected all trees
+gpub030:2310658:2310725 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub030:2310658:2310725 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub030:2310658:2310725 [1] NCCL INFO comm 0x50672d50 rank 21 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub059:1894383:1894383 [0] NCCL INFO cudaDriverVersion 12010
+gpub059:1894383:1894383 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.159<0>
+gpub059:1894383:1894383 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub059:1894383:1894458 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.159<0>
+gpub059:1894383:1894458 [0] NCCL INFO Using network IB
+gpub059:1894383:1894458 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub059:1894383:1894458 [0] NCCL INFO Trees [0] 33/48/-1->32->0 [1] 33/-1/-1->32->36
+gpub059:1894383:1894458 [0] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [receive] via NET/IB/0
+gpub059:1894383:1894458 [0] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [receive] via NET/IB/0
+gpub059:1894383:1894458 [0] NCCL INFO Channel 00/0 : 32[7000] -> 33[46000] via P2P/IPC
+gpub059:1894383:1894458 [0] NCCL INFO Channel 01/0 : 32[7000] -> 33[46000] via P2P/IPC
+gpub059:1894383:1894458 [0] NCCL INFO Connected all rings
+gpub059:1894383:1894458 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [send] via NET/IB/0
+gpub059:1894383:1894458 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [send] via NET/IB/0
+gpub059:1894383:1894458 [0] NCCL INFO Channel 00/0 : 0[7000] -> 32[7000] [receive] via NET/IB/0
+gpub059:1894383:1894458 [0] NCCL INFO Channel 00/0 : 32[7000] -> 0[7000] [send] via NET/IB/0
+gpub059:1894383:1894458 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [receive] via NET/IB/0
+gpub059:1894383:1894458 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [receive] via NET/IB/0
+gpub059:1894383:1894458 [0] NCCL INFO Connected all trees
+gpub059:1894383:1894458 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub059:1894383:1894458 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub059:1894383:1894458 [0] NCCL INFO comm 0x510467d0 rank 32 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub059:1894386:1894386 [3] NCCL INFO cudaDriverVersion 12010
+gpub059:1894386:1894386 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.159<0>
+gpub059:1894386:1894386 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub059:1894386:1894456 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.159<0>
+gpub059:1894386:1894456 [3] NCCL INFO Using network IB
+gpub059:1894386:1894456 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub059:1894386:1894456 [3] NCCL INFO Trees [0] -1/-1/-1->35->34 [1] -1/-1/-1->35->34
+gpub059:1894386:1894456 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [send] via NET/IB/0
+gpub059:1894386:1894456 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [send] via NET/IB/0
+gpub059:1894386:1894456 [3] NCCL INFO Connected all rings
+gpub059:1894386:1894456 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 34[85000] via P2P/IPC
+gpub059:1894386:1894456 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 34[85000] via P2P/IPC
+gpub059:1894386:1894456 [3] NCCL INFO Connected all trees
+gpub059:1894386:1894456 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub059:1894386:1894456 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub059:1894386:1894456 [3] NCCL INFO comm 0x9cf1390 rank 35 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub059:1894385:1894385 [2] NCCL INFO cudaDriverVersion 12010
+gpub059:1894385:1894385 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.159<0>
+gpub059:1894385:1894385 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub059:1894385:1894457 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.159<0>
+gpub059:1894385:1894457 [2] NCCL INFO Using network IB
+gpub059:1894385:1894457 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub059:1894385:1894457 [2] NCCL INFO Trees [0] 35/-1/-1->34->33 [1] 35/-1/-1->34->33
+gpub059:1894385:1894457 [2] NCCL INFO Channel 00/0 : 34[85000] -> 35[c7000] via P2P/IPC
+gpub059:1894385:1894457 [2] NCCL INFO Channel 01/0 : 34[85000] -> 35[c7000] via P2P/IPC
+gpub059:1894385:1894457 [2] NCCL INFO Connected all rings
+gpub059:1894385:1894457 [2] NCCL INFO Channel 00/0 : 34[85000] -> 33[46000] via P2P/IPC
+gpub059:1894385:1894457 [2] NCCL INFO Channel 01/0 : 34[85000] -> 33[46000] via P2P/IPC
+gpub059:1894385:1894457 [2] NCCL INFO Connected all trees
+gpub059:1894385:1894457 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub059:1894385:1894457 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub059:1894385:1894457 [2] NCCL INFO comm 0x50af3510 rank 34 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub030:2310660:2310660 [3] NCCL INFO cudaDriverVersion 12010
+gpub030:2310660:2310660 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.130<0>
+gpub030:2310660:2310660 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub030:2310660:2310727 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.130<0>
+gpub030:2310660:2310727 [3] NCCL INFO Using network IB
+gpub030:2310660:2310727 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub030:2310660:2310727 [3] NCCL INFO Trees [0] -1/-1/-1->23->22 [1] -1/-1/-1->23->22
+gpub030:2310660:2310727 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [send] via NET/IB/0
+gpub030:2310660:2310727 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [send] via NET/IB/0
+gpub030:2310660:2310727 [3] NCCL INFO Connected all rings
+gpub030:2310660:2310727 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 22[85000] via P2P/IPC
+gpub030:2310660:2310727 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 22[85000] via P2P/IPC
+gpub030:2310660:2310727 [3] NCCL INFO Connected all trees
+gpub030:2310660:2310727 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub030:2310660:2310727 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub030:2310660:2310727 [3] NCCL INFO comm 0xa84d3a10 rank 23 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub031:1878312:1878312 [1] NCCL INFO cudaDriverVersion 12010
+gpub031:1878312:1878312 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0>
+gpub031:1878312:1878312 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub031:1878312:1878390 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0>
+gpub031:1878312:1878390 [1] NCCL INFO Using network IB
+gpub031:1878312:1878390 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub031:1878312:1878390 [1] NCCL INFO Trees [0] 26/20/-1->25->24 [1] 26/-1/-1->25->24
+gpub031:1878312:1878390 [1] NCCL INFO Channel 00/0 : 25[46000] -> 26[85000] via P2P/IPC
+gpub031:1878312:1878390 [1] NCCL INFO Channel 01/0 : 25[46000] -> 26[85000] via P2P/IPC
+gpub031:1878312:1878390 [1] NCCL INFO Connected all rings
+gpub031:1878312:1878390 [1] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [receive] via NET/IB/0
+gpub031:1878312:1878390 [1] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [send] via NET/IB/0
+gpub031:1878312:1878390 [1] NCCL INFO Channel 00/0 : 25[46000] -> 24[7000] via P2P/IPC
+gpub031:1878312:1878390 [1] NCCL INFO Channel 01/0 : 25[46000] -> 24[7000] via P2P/IPC
+gpub031:1878312:1878390 [1] NCCL INFO Connected all trees
+gpub031:1878312:1878390 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub031:1878312:1878390 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub031:1878312:1878390 [1] NCCL INFO comm 0x509faf60 rank 25 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub032:3246895:3246895 [3] NCCL INFO cudaDriverVersion 12010
+gpub032:3246895:3246895 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.132<0>
+gpub032:3246895:3246895 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub032:3246895:3246973 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.132<0>
+gpub032:3246895:3246973 [3] NCCL INFO Using network IB
+gpub032:3246895:3246973 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub032:3246895:3246973 [3] NCCL INFO Trees [0] -1/-1/-1->31->30 [1] -1/-1/-1->31->30
+gpub032:3246895:3246973 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [send] via NET/IB/0
+gpub032:3246895:3246973 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [send] via NET/IB/0
+gpub032:3246895:3246973 [3] NCCL INFO Connected all rings
+gpub032:3246895:3246973 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 30[85000] via P2P/IPC
+gpub032:3246895:3246973 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 30[85000] via P2P/IPC
+gpub032:3246895:3246973 [3] NCCL INFO Connected all trees
+gpub032:3246895:3246973 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub032:3246895:3246973 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub032:3246895:3246973 [3] NCCL INFO comm 0x1b5e5670 rank 31 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub077:252895:252895 [3] NCCL INFO cudaDriverVersion 12010
+gpub077:252895:252895 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.177<0>
+gpub077:252895:252895 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub077:252895:252963 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.177<0>
+gpub077:252895:252963 [3] NCCL INFO Using network IB
+gpub077:252895:252963 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub077:252895:252963 [3] NCCL INFO Trees [0] -1/-1/-1->55->54 [1] -1/-1/-1->55->54
+gpub077:252895:252963 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [send] via NET/IB/0
+gpub077:252895:252963 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [send] via NET/IB/0
+gpub077:252895:252963 [3] NCCL INFO Connected all rings
+gpub077:252895:252963 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 54[85000] via P2P/IPC
+gpub077:252895:252963 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 54[85000] via P2P/IPC
+gpub077:252895:252963 [3] NCCL INFO Connected all trees
+gpub077:252895:252963 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub077:252895:252963 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub077:252895:252963 [3] NCCL INFO comm 0x9491900 rank 55 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub031:1878311:1878311 [0] NCCL INFO cudaDriverVersion 12010
+gpub031:1878311:1878311 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0>
+gpub031:1878311:1878311 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub031:1878311:1878392 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0>
+gpub031:1878311:1878392 [0] NCCL INFO Using network IB
+gpub031:1878311:1878392 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub031:1878311:1878392 [0] NCCL INFO Trees [0] 25/28/-1->24->16 [1] 25/-1/-1->24->21
+gpub031:1878311:1878392 [0] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [receive] via NET/IB/0
+gpub031:1878311:1878392 [0] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [receive] via NET/IB/0
+gpub031:1878311:1878392 [0] NCCL INFO Channel 00/0 : 24[7000] -> 25[46000] via P2P/IPC
+gpub031:1878311:1878392 [0] NCCL INFO Channel 01/0 : 24[7000] -> 25[46000] via P2P/IPC
+gpub031:1878311:1878392 [0] NCCL INFO Connected all rings
+gpub031:1878311:1878392 [0] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [receive] via NET/IB/0
+gpub031:1878311:1878392 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [send] via NET/IB/0
+gpub031:1878311:1878392 [0] NCCL INFO Channel 00/0 : 16[7000] -> 24[7000] [receive] via NET/IB/0
+gpub031:1878311:1878392 [0] NCCL INFO Channel 00/0 : 24[7000] -> 16[7000] [send] via NET/IB/0
+gpub031:1878311:1878392 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [receive] via NET/IB/0
+gpub031:1878311:1878392 [0] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [send] via NET/IB/0
+gpub031:1878311:1878392 [0] NCCL INFO Connected all trees
+gpub031:1878311:1878392 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub031:1878311:1878392 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub031:1878311:1878392 [0] NCCL INFO comm 0xba515710 rank 24 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub096:1440104:1440104 [3] NCCL INFO cudaDriverVersion 12010
+gpub096:1440104:1440104 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.196<0>
+gpub096:1440104:1440104 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub096:1440104:1440176 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.196<0>
+gpub096:1440104:1440176 [3] NCCL INFO Using network IB
+gpub096:1440104:1440176 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub096:1440104:1440176 [3] NCCL INFO Trees [0] -1/-1/-1->63->62 [1] -1/-1/-1->63->62
+gpub096:1440104:1440176 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 0[7000] [send] via NET/IB/0
+gpub096:1440104:1440176 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 0[7000] [send] via NET/IB/0
+gpub096:1440104:1440176 [3] NCCL INFO Connected all rings
+gpub096:1440104:1440176 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 62[85000] via P2P/IPC
+gpub096:1440104:1440176 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 62[85000] via P2P/IPC
+gpub096:1440104:1440176 [3] NCCL INFO Connected all trees
+gpub096:1440104:1440176 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub096:1440104:1440176 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub096:1440104:1440176 [3] NCCL INFO comm 0x9f265ce0 rank 63 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub096:1440103:1440103 [2] NCCL INFO cudaDriverVersion 12010
+gpub096:1440103:1440103 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.196<0>
+gpub096:1440103:1440103 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub096:1440103:1440178 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.196<0>
+gpub096:1440103:1440178 [2] NCCL INFO Using network IB
+gpub096:1440103:1440178 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub096:1440103:1440178 [2] NCCL INFO Trees [0] 63/-1/-1->62->61 [1] 63/-1/-1->62->61
+gpub096:1440103:1440178 [2] NCCL INFO Channel 00/0 : 62[85000] -> 63[c7000] via P2P/IPC
+gpub096:1440103:1440178 [2] NCCL INFO Channel 01/0 : 62[85000] -> 63[c7000] via P2P/IPC
+gpub096:1440103:1440178 [2] NCCL INFO Connected all rings
+gpub096:1440103:1440178 [2] NCCL INFO Channel 00/0 : 62[85000] -> 61[46000] via P2P/IPC
+gpub096:1440103:1440178 [2] NCCL INFO Channel 01/0 : 62[85000] -> 61[46000] via P2P/IPC
+gpub096:1440103:1440178 [2] NCCL INFO Connected all trees
+gpub096:1440103:1440178 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub096:1440103:1440178 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub096:1440103:1440178 [2] NCCL INFO comm 0x91c6060 rank 62 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub096:1440101:1440101 [0] NCCL INFO cudaDriverVersion 12010
+gpub096:1440101:1440101 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.196<0>
+gpub096:1440101:1440101 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub096:1440101:1440177 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.196<0>
+gpub096:1440101:1440177 [0] NCCL INFO Using network IB
+gpub096:1440101:1440177 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub096:1440101:1440177 [0] NCCL INFO Trees [0] 61/-1/-1->60->56 [1] 61/28/-1->60->-1
+gpub096:1440101:1440177 [0] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [receive] via NET/IB/0
+gpub096:1440101:1440177 [0] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [receive] via NET/IB/0
+gpub096:1440101:1440177 [0] NCCL INFO Channel 00/0 : 60[7000] -> 61[46000] via P2P/IPC
+gpub096:1440101:1440177 [0] NCCL INFO Channel 01/0 : 60[7000] -> 61[46000] via P2P/IPC
+gpub096:1440101:1440177 [0] NCCL INFO Connected all rings
+gpub096:1440101:1440177 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [receive] via NET/IB/0
+gpub096:1440101:1440177 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [receive] via NET/IB/0
+gpub096:1440101:1440177 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [send] via NET/IB/0
+gpub096:1440101:1440177 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [send] via NET/IB/0
+gpub096:1440101:1440177 [0] NCCL INFO Connected all trees
+gpub096:1440101:1440177 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub096:1440101:1440177 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub096:1440101:1440177 [0] NCCL INFO comm 0x50b020d0 rank 60 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub096:1440102:1440102 [1] NCCL INFO cudaDriverVersion 12010
+gpub096:1440102:1440102 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.196<0>
+gpub096:1440102:1440102 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub096:1440102:1440179 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.196<0>
+gpub096:1440102:1440179 [1] NCCL INFO Using network IB
+gpub096:1440102:1440179 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub096:1440102:1440179 [1] NCCL INFO Trees [0] 62/-1/-1->61->60 [1] 62/-1/-1->61->60
+gpub096:1440102:1440179 [1] NCCL INFO Channel 00/0 : 61[46000] -> 62[85000] via P2P/IPC
+gpub096:1440102:1440179 [1] NCCL INFO Channel 01/0 : 61[46000] -> 62[85000] via P2P/IPC
+gpub096:1440102:1440179 [1] NCCL INFO Connected all rings
+gpub096:1440102:1440179 [1] NCCL INFO Channel 00/0 : 61[46000] -> 60[7000] via P2P/IPC
+gpub096:1440102:1440179 [1] NCCL INFO Channel 01/0 : 61[46000] -> 60[7000] via P2P/IPC
+gpub096:1440102:1440179 [1] NCCL INFO Connected all trees
+gpub096:1440102:1440179 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub096:1440102:1440179 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub096:1440102:1440179 [1] NCCL INFO comm 0x50d96930 rank 61 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
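The [W reducer.cpp:1298] warning above refers to the find_unused_parameters flag passed when the model is wrapped in DistributedDataParallel. A minimal sketch of the setting it suggests (illustrative only, not the ESPnet trainer code; wrap_model and local_rank are hypothetical names):

import torch
from torch.nn.parallel import DistributedDataParallel as DDP

def wrap_model(model: torch.nn.Module, local_rank: int) -> DDP:
    # Assumes torch.distributed.init_process_group() has already run.
    # find_unused_parameters=True makes DDP traverse the autograd graph
    # after every forward pass to mark parameters that received no
    # gradient; if every parameter is always used (as the warning says
    # happened here), False avoids that extra traversal.
    return DDP(
        model.to(local_rank),
        device_ids=[local_rank],
        find_unused_parameters=False,
    )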
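For reference, the NCCL INFO lines above show each of the 64 ranks (16 nodes x 4 GPUs) reaching "Init COMPLETE" after wiring its ring and tree channels: P2P/IPC between GPUs on the same node, NET/IB/0 (RoCE) between nodes. A hedged connectivity smoke test for such a setup (not part of this log's code; assumes a torchrun-style LOCAL_RANK variable and an initialized NCCL process group):

import os
import torch
import torch.distributed as dist

def nccl_smoke_test() -> None:
    # After init, an all_reduce of ones should equal the world size
    # (64 here) on every rank if all channels are actually connected.
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)
    x = torch.ones(1, device="cuda")
    dist.all_reduce(x)  # default op is SUM
    assert int(x.item()) == dist.get_world_size()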
+[gpub001:0/64] 2023-07-03 22:35:20,343 (trainer:732) INFO: 9epoch:train:1-100batch: iter_time=1.479, forward_time=0.254, loss_ctc=89.671, loss_att=65.418, acc=0.668, loss=72.694, backward_time=1.040, grad_norm=92.761, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.182, optim0_lr0=1.336e-04, train_time=8.067
+[gpub001:0/64] 2023-07-03 22:37:35,916 (trainer:732) INFO: 9epoch:train:101-200batch: iter_time=1.205e-04, forward_time=0.142, loss_ctc=77.074, loss_att=59.177, acc=0.644, loss=64.546, backward_time=1.024, grad_norm=98.231, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.335e-04, train_time=2.714
+[gpub001:0/64] 2023-07-03 22:39:51,736 (trainer:732) INFO: 9epoch:train:201-300batch: iter_time=1.193e-04, forward_time=0.141, loss_ctc=83.406, loss_att=66.282, acc=0.665, loss=71.419, backward_time=1.025, grad_norm=105.432, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.334e-04, train_time=2.716
+[gpub001:0/64] 2023-07-03 22:42:06,607 (trainer:732) INFO: 9epoch:train:301-400batch: iter_time=1.219e-04, forward_time=0.142, loss_ctc=70.602, loss_att=53.997, acc=0.653, loss=58.979, backward_time=1.022, grad_norm=85.786, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.333e-04, train_time=2.697
+[gpub001:0/64] 2023-07-03 22:44:49,865 (trainer:732) INFO: 9epoch:train:401-500batch: iter_time=1.219e-04, forward_time=0.142, loss_ctc=84.185, loss_att=67.677, acc=0.649, loss=72.629, backward_time=1.077, grad_norm=101.589, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.332e-04, train_time=3.265
+[gpub001:0/64] 2023-07-03 22:47:04,832 (trainer:732) INFO: 9epoch:train:501-600batch: iter_time=1.129e-04, forward_time=0.140, loss_ctc=77.007, loss_att=63.663, acc=0.649, loss=67.666, backward_time=1.021, grad_norm=119.077, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.331e-04, train_time=2.699
+[gpub001:0/64] 2023-07-03 22:49:20,582 (trainer:732) INFO: 9epoch:train:601-700batch: iter_time=1.158e-04, forward_time=0.142, loss_ctc=69.128, loss_att=50.937, acc=0.670, loss=56.394, backward_time=1.023, grad_norm=85.221, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.330e-04, train_time=2.715
+[gpub001:0/64] 2023-07-03 22:51:38,011 (trainer:732) INFO: 9epoch:train:701-800batch: iter_time=1.240e-04, forward_time=0.142, loss_ctc=87.200, loss_att=70.450, acc=0.661, loss=75.475, backward_time=1.023, grad_norm=107.449, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.329e-04, train_time=2.748
+[gpub001:0/64] 2023-07-03 22:54:40,138 (trainer:732) INFO: 9epoch:train:801-900batch: iter_time=1.192e-04, forward_time=0.142, loss_ctc=84.642, loss_att=62.734, acc=0.671, loss=69.306, backward_time=1.081, grad_norm=87.657, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.328e-04, train_time=3.642
+[gpub001:0/64] 2023-07-03 22:57:27,498 (trainer:732) INFO: 9epoch:train:901-1000batch: iter_time=1.098e-04, forward_time=0.141, loss_ctc=84.316, loss_att=59.445, acc=0.671, loss=66.906, backward_time=1.063, grad_norm=104.559, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.327e-04, train_time=3.347
+[gpub001:0/64] 2023-07-03 22:57:41,932 (multiple_iter_factory:32) INFO: Building 1th iter-factory...
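The per-batch numbers above are consistent with the usual hybrid CTC/attention objective with a CTC weight of 0.3 (inferred from the logged values, not printed by the trainer): loss = 0.3 * loss_ctc + 0.7 * loss_att. For batches 1-100, 0.3 * 89.671 + 0.7 * 65.418 = 72.694, exactly the logged loss. A one-line check:

def combined_loss(loss_ctc: float, loss_att: float, ctc_weight: float = 0.3) -> float:
    # Hybrid CTC/attention interpolation; ctc_weight=0.3 is inferred, not logged.
    return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att

print(round(combined_loss(89.671, 65.418), 3))  # -> 72.694, matching batches 1-100
print(round(combined_loss(77.074, 59.177), 3))  # -> 64.546, matching batches 101-200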
+[gpub001:0/64] 2023-07-03 22:58:04,079 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-03 22:58:08,258 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.4", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.4", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.4", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.4", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-03 22:58:08,259 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.4,
+[gpub001:0/64] 2023-07-03 22:58:08,266 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-03 23:04:34,809 (trainer:732) INFO: 9epoch:train:1001-1100batch: iter_time=2.754, forward_time=0.180, loss_ctc=90.046, loss_att=65.751, acc=0.658, loss=73.039, backward_time=1.044, grad_norm=97.475, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.182, optim0_lr0=1.326e-04, train_time=8.546
+[gpub001:0/64] 2023-07-03 23:06:58,236 (trainer:732) INFO: 9epoch:train:1101-1200batch: iter_time=1.332e-04, forward_time=0.144, loss_ctc=77.169, loss_att=57.792, acc=0.640, loss=63.605, backward_time=1.031, grad_norm=84.645, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.182, optim0_lr0=1.325e-04, train_time=2.869
+[gpub001:0/64] 2023-07-03 23:09:20,632 (trainer:732) INFO: 9epoch:train:1201-1300batch: iter_time=1.286e-04, forward_time=0.144, loss_ctc=83.261, loss_att=65.116, acc=0.660, loss=70.559, backward_time=1.033, grad_norm=92.693, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.182, optim0_lr0=1.325e-04, train_time=2.848
+[gpub001:0/64] 2023-07-03 23:11:45,452 (trainer:732) INFO: 9epoch:train:1301-1400batch: iter_time=1.543e-04, forward_time=0.145, loss_ctc=69.216, loss_att=52.051, acc=0.651, loss=57.200, backward_time=1.030, grad_norm=85.186, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.182, optim0_lr0=1.324e-04, train_time=2.896
+[gpub001:0/64] 2023-07-03 23:14:11,669 (trainer:732) INFO: 9epoch:train:1401-1500batch: iter_time=1.219e-04, forward_time=0.145, loss_ctc=81.472, loss_att=65.882, acc=0.654, loss=70.559, backward_time=1.051, grad_norm=98.760, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.182, optim0_lr0=1.323e-04, train_time=2.924
+[gpub001:0/64] 2023-07-03 23:16:47,813 (trainer:732) INFO: 9epoch:train:1501-1600batch: iter_time=1.272e-04, forward_time=0.144, loss_ctc=75.580, loss_att=63.756, acc=0.646, loss=67.303, backward_time=1.053, grad_norm=90.582, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.182, optim0_lr0=1.322e-04, train_time=3.123
+[gpub001:0/64] 2023-07-03 23:19:18,487 (trainer:732) INFO: 9epoch:train:1601-1700batch: iter_time=1.166e-04, forward_time=0.144, loss_ctc=70.019, loss_att=51.653, acc=0.666, loss=57.163, backward_time=1.035, grad_norm=88.067, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.321e-04, train_time=3.013
+[gpub001:0/64] 2023-07-03 23:22:07,029 (trainer:732) INFO: 9epoch:train:1701-1800batch: iter_time=1.322e-04, forward_time=0.145, loss_ctc=83.559, loss_att=68.890, acc=0.660, loss=73.291, backward_time=1.058, grad_norm=93.644, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.182, optim0_lr0=1.320e-04, train_time=3.371
+[gpub001:0/64] 2023-07-03 23:24:43,521 (trainer:732) INFO: 9epoch:train:1801-1900batch: iter_time=1.216e-04, forward_time=0.146, loss_ctc=82.347, loss_att=62.197, acc=0.665, loss=68.242, backward_time=1.071, grad_norm=81.502, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.319e-04, train_time=3.130
+[gpub001:0/64] 2023-07-03 23:27:16,221 (trainer:732) INFO: 9epoch:train:1901-2000batch: iter_time=1.205e-04, forward_time=0.145, loss_ctc=79.689, loss_att=57.173, acc=0.675, loss=63.928, backward_time=1.053, grad_norm=110.108, clip=100.000, loss_scale=8.590e+09, optim_step_time=0.181, optim0_lr0=1.318e-04, train_time=3.054
+[gpub001:0/64] 2023-07-03 23:27:18,258 (multiple_iter_factory:32) INFO: Building 2th iter-factory...
+[gpub001:0/64] 2023-07-03 23:27:40,440 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-03 23:27:44,994 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.1", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.1", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.1", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.1", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-03 23:27:44,994 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.1,
+[gpub001:0/64] 2023-07-03 23:27:45,002 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-03 23:33:19,007 (trainer:732) INFO: 9epoch:train:2001-2100batch: iter_time=1.581, forward_time=0.193, loss_ctc=87.970, loss_att=63.692, acc=0.678, loss=70.975, backward_time=1.046, grad_norm=89.931, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.192, optim0_lr0=1.317e-04, train_time=7.255
+[gpub001:0/64] 2023-07-03 23:35:35,590 (trainer:732) INFO: 9epoch:train:2101-2200batch: iter_time=1.287e-04, forward_time=0.145, loss_ctc=76.465, loss_att=57.377, acc=0.649, loss=63.104, backward_time=1.025, grad_norm=85.685, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.181, optim0_lr0=1.316e-04, train_time=2.732
+[gpub001:0/64] 2023-07-03 23:37:51,391 (trainer:732) INFO: 9epoch:train:2201-2300batch: iter_time=1.298e-04, forward_time=0.144, loss_ctc=83.755, loss_att=67.648, acc=0.667, loss=72.480, backward_time=1.025, grad_norm=95.612, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.181, optim0_lr0=1.315e-04, train_time=2.716
+[gpub001:0/64] 2023-07-03 23:40:06,439 (trainer:732) INFO: 9epoch:train:2301-2400batch: iter_time=1.220e-04, forward_time=0.144, loss_ctc=67.251, loss_att=51.411, acc=0.664, loss=56.163, backward_time=1.021, grad_norm=84.349, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.181, optim0_lr0=1.314e-04, train_time=2.701
+[gpub001:0/64] 2023-07-03 23:42:25,815 (trainer:732) INFO: 9epoch:train:2401-2500batch: iter_time=1.184e-04, forward_time=0.145, loss_ctc=81.714, loss_att=66.031, acc=0.656, loss=70.736, backward_time=1.030, grad_norm=98.459, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.313e-04, train_time=2.787
+[gpub001:0/64] 2023-07-03 23:44:44,239 (trainer:732) INFO: 9epoch:train:2501-2600batch: iter_time=1.231e-04, forward_time=0.145, loss_ctc=74.569, loss_att=61.337, acc=0.654, loss=65.307, backward_time=1.031, grad_norm=86.312, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.181, optim0_lr0=1.313e-04, train_time=2.768
+[gpub001:0/64] 2023-07-03 23:47:09,040 (trainer:732) INFO: 9epoch:train:2601-2700batch: iter_time=1.325e-04, forward_time=0.144, loss_ctc=70.692, loss_att=50.739, acc=0.672, loss=56.725, backward_time=1.034, grad_norm=94.955, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.181, optim0_lr0=1.312e-04, train_time=2.896
+[gpub001:0/64] 2023-07-03 23:49:46,981 (trainer:732) INFO: 9epoch:train:2701-2800batch: iter_time=1.268e-04, forward_time=0.146, loss_ctc=82.425, loss_att=66.291, acc=0.670, loss=71.131, backward_time=1.050, grad_norm=96.178, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.181, optim0_lr0=1.311e-04, train_time=3.159
+[gpub001:0/64] 2023-07-03 23:52:17,933 (trainer:732) INFO: 9epoch:train:2801-2900batch: iter_time=1.165e-04, forward_time=0.144, loss_ctc=82.507, loss_att=61.707, acc=0.676, loss=67.947, backward_time=1.069, grad_norm=78.205, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.310e-04, train_time=3.019
+[gpub001:0/64] 2023-07-03 23:55:02,472 (trainer:732) INFO: 9epoch:train:2901-3000batch: iter_time=1.113e-04, forward_time=0.145, loss_ctc=80.597, loss_att=57.983, acc=0.678, loss=64.767, backward_time=1.050, grad_norm=96.704, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.181, optim0_lr0=1.309e-04, train_time=3.291
+[gpub001:0/64] 2023-07-03 23:55:22,500 (multiple_iter_factory:32) INFO: Building 3th iter-factory...
+[gpub001:0/64] 2023-07-03 23:55:45,123 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-03 23:55:49,354 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.8", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.8", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.8", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.8", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-03 23:55:49,354 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.8,
+[gpub001:0/64] 2023-07-03 23:55:49,431 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 00:00:56,009 (trainer:732) INFO: 9epoch:train:3001-3100batch: iter_time=2.040, forward_time=0.188, loss_ctc=87.894, loss_att=64.088, acc=0.666, loss=71.230, backward_time=1.042, grad_norm=89.421, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.184, optim0_lr0=1.308e-04, train_time=7.070
+[gpub001:0/64] 2023-07-04 00:03:24,637 (trainer:732) INFO: 9epoch:train:3101-3200batch: iter_time=7.802e-04, forward_time=0.237, loss_ctc=73.542, loss_att=54.936, acc=0.650, loss=60.518, backward_time=1.038, grad_norm=83.280, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.190, optim0_lr0=1.307e-04, train_time=2.973
+[gpub001:0/64] 2023-07-04 00:05:45,751 (trainer:732) INFO: 9epoch:train:3201-3300batch: iter_time=1.097e-04, forward_time=0.168, loss_ctc=83.071, loss_att=63.854, acc=0.666, loss=69.619, backward_time=1.029, grad_norm=84.872, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.183, optim0_lr0=1.306e-04, train_time=2.820
+[gpub001:0/64] 2023-07-04 00:08:15,073 (trainer:732) INFO: 9epoch:train:3301-3400batch: iter_time=2.763e-04, forward_time=0.240, loss_ctc=67.308, loss_att=51.039, acc=0.655, loss=55.919, backward_time=1.042, grad_norm=76.085, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.187, optim0_lr0=1.305e-04, train_time=2.988
+[gpub001:0/64] 2023-07-04 00:10:38,777 (trainer:732) INFO: 9epoch:train:3401-3500batch: iter_time=1.431e-04, forward_time=0.181, loss_ctc=81.458, loss_att=64.858, acc=0.657, loss=69.838, backward_time=1.035, grad_norm=85.868, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.185, optim0_lr0=1.305e-04, train_time=2.873
+[gpub001:0/64] 2023-07-04 00:13:07,268 (trainer:732) INFO: 9epoch:train:3501-3600batch: iter_time=4.291e-04, forward_time=0.234, loss_ctc=73.981, loss_att=61.944, acc=0.651, loss=65.555, backward_time=1.036, grad_norm=95.691, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.186, optim0_lr0=1.304e-04, train_time=2.970
+[gpub001:0/64] 2023-07-04 00:15:26,106 (trainer:732) INFO: 9epoch:train:3601-3700batch: iter_time=1.187e-04, forward_time=0.168, loss_ctc=68.552, loss_att=50.700, acc=0.671, loss=56.056, backward_time=1.026, grad_norm=80.833, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.303e-04, train_time=2.775
+[gpub001:0/64] 2023-07-04 00:18:11,878 (trainer:732) INFO: 9epoch:train:3701-3800batch: iter_time=5.008e-04, forward_time=0.250, loss_ctc=81.535, loss_att=67.538, acc=0.661, loss=71.737, backward_time=1.059, grad_norm=101.130, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.188, optim0_lr0=1.302e-04, train_time=3.316
+[gpub001:0/64] 2023-07-04 00:20:43,658 (trainer:732) INFO: 9epoch:train:3801-3900batch: iter_time=5.966e-04, forward_time=0.156, loss_ctc=82.350, loss_att=61.685, acc=0.669, loss=67.885, backward_time=1.048, grad_norm=98.009, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.301e-04, train_time=3.035
+[gpub001:0/64] 2023-07-04 00:23:35,253 (trainer:732) INFO: 9epoch:train:3901-4000batch: iter_time=1.404e-04, forward_time=0.240, loss_ctc=78.814, loss_att=56.564, acc=0.678, loss=63.239, backward_time=1.090, grad_norm=90.393, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.185, optim0_lr0=1.300e-04, train_time=3.431
+[gpub001:0/64] 2023-07-04 00:23:55,438 (multiple_iter_factory:32) INFO: Building 4th iter-factory...
+[gpub001:0/64] 2023-07-04 00:24:17,575 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 00:24:21,806 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.7", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.7", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.7", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.7", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 00:24:21,806 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.7,
+[gpub001:0/64] 2023-07-04 00:24:21,829 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 00:31:27,733 (trainer:732) INFO: 9epoch:train:4001-4100batch: iter_time=2.183, forward_time=0.215, loss_ctc=87.545, loss_att=63.219, acc=0.679, loss=70.517, backward_time=1.043, grad_norm=95.106, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.186, optim0_lr0=1.299e-04, train_time=9.450
+[gpub001:0/64] 2023-07-04 00:33:43,494 (trainer:732) INFO: 9epoch:train:4101-4200batch: iter_time=1.178e-04, forward_time=0.145, loss_ctc=74.551, loss_att=56.168, acc=0.656, loss=61.683, backward_time=1.024, grad_norm=92.157, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.298e-04, train_time=2.715
+[gpub001:0/64] 2023-07-04 00:35:59,666 (trainer:732) INFO: 9epoch:train:4201-4300batch: iter_time=1.157e-04, forward_time=0.145, loss_ctc=82.064, loss_att=63.472, acc=0.678, loss=69.050, backward_time=1.028, grad_norm=98.221, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.297e-04, train_time=2.723
+[gpub001:0/64] 2023-07-04 00:38:15,095 (trainer:732) INFO: 9epoch:train:4301-4400batch: iter_time=1.075e-04, forward_time=0.143, loss_ctc=68.892, loss_att=51.913, acc=0.663, loss=57.007, backward_time=1.023, grad_norm=82.794, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.297e-04, train_time=2.708
+[gpub001:0/64] 2023-07-04 00:40:30,833 (trainer:732) INFO: 9epoch:train:4401-4500batch: iter_time=1.207e-04, forward_time=0.145, loss_ctc=82.133, loss_att=65.286, acc=0.663, loss=70.340, backward_time=1.027, grad_norm=97.104, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.296e-04, train_time=2.715
+[gpub001:0/64] 2023-07-04 00:42:58,614 (trainer:732) INFO: 9epoch:train:4501-4600batch: iter_time=1.234e-04, forward_time=0.144, loss_ctc=73.526, loss_att=61.357, acc=0.659, loss=65.008, backward_time=1.042, grad_norm=89.909, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.295e-04, train_time=2.955
+[gpub001:0/64] 2023-07-04 00:45:13,932 (trainer:732) INFO: 9epoch:train:4601-4700batch: iter_time=1.179e-04, forward_time=0.144, loss_ctc=69.403, loss_att=50.034, acc=0.676, loss=55.845, backward_time=1.023, grad_norm=93.331, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.294e-04, train_time=2.706
+[gpub001:0/64] 2023-07-04 00:47:42,926 (trainer:732) INFO: 9epoch:train:4701-4800batch: iter_time=1.269e-04, forward_time=0.145, loss_ctc=81.878, loss_att=66.373, acc=0.671, loss=71.025, backward_time=1.048, grad_norm=98.786, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.293e-04, train_time=2.980
+[gpub001:0/64] 2023-07-04 00:50:21,085 (trainer:732) INFO: 9epoch:train:4801-4900batch: iter_time=1.140e-04, forward_time=0.146, loss_ctc=83.277, loss_att=60.906, acc=0.679, loss=67.617, backward_time=1.047, grad_norm=96.167, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.292e-04, train_time=3.163
+[gpub001:0/64] 2023-07-04 00:52:52,099 (trainer:732) INFO: 9epoch:train:4901-5000batch: iter_time=1.094e-04, forward_time=0.146, loss_ctc=79.400, loss_att=56.855, acc=0.683, loss=63.619, backward_time=1.051, grad_norm=92.534, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.181, optim0_lr0=1.291e-04, train_time=3.020
+[gpub001:0/64] 2023-07-04 00:53:12,127 (multiple_iter_factory:32) INFO: Building 5th iter-factory...
+[gpub001:0/64] 2023-07-04 00:53:34,575 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 00:53:38,834 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.0", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.0", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.0", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.0", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 00:53:38,834 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.0,
+[gpub001:0/64] 2023-07-04 00:53:38,842 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 00:59:54,062 (trainer:732) INFO: 9epoch:train:5001-5100batch: iter_time=1.788, forward_time=0.204, loss_ctc=86.818, loss_att=63.626, acc=0.669, loss=70.584, backward_time=1.040, grad_norm=93.196, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.183, optim0_lr0=1.291e-04, train_time=8.438
+[gpub001:0/64] 2023-07-04 01:02:10,587 (trainer:732) INFO: 9epoch:train:5101-5200batch: iter_time=1.492e-04, forward_time=0.146, loss_ctc=76.113, loss_att=56.301, acc=0.651, loss=62.245, backward_time=1.026, grad_norm=83.102, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.183, optim0_lr0=1.290e-04, train_time=2.732
+[gpub001:0/64] 2023-07-04 01:04:27,185 (trainer:732) INFO: 9epoch:train:5201-5300batch: iter_time=1.206e-04, forward_time=0.144, loss_ctc=80.303, loss_att=62.833, acc=0.671, loss=68.074, backward_time=1.028, grad_norm=89.660, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.289e-04, train_time=2.732
+[gpub001:0/64] 2023-07-04 01:06:45,184 (trainer:732) INFO: 9epoch:train:5301-5400batch: iter_time=1.284e-04, forward_time=0.144, loss_ctc=65.878, loss_att=49.861, acc=0.667, loss=54.666, backward_time=1.025, grad_norm=73.201, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.288e-04, train_time=2.760
+[gpub001:0/64] 2023-07-04 01:09:00,951 (trainer:732) INFO: 9epoch:train:5401-5500batch: iter_time=1.160e-04, forward_time=0.144, loss_ctc=80.403, loss_att=63.695, acc=0.661, loss=68.708, backward_time=1.023, grad_norm=99.950, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.287e-04, train_time=2.715
+[gpub001:0/64] 2023-07-04 01:11:27,791 (trainer:732) INFO: 9epoch:train:5501-5600batch: iter_time=1.300e-04, forward_time=0.144, loss_ctc=73.122, loss_att=62.649, acc=0.653, loss=65.791, backward_time=1.033, grad_norm=94.019, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.286e-04, train_time=2.937
+[gpub001:0/64] 2023-07-04 01:13:52,865 (trainer:732) INFO: 9epoch:train:5601-5700batch: iter_time=1.200e-04, forward_time=0.144, loss_ctc=67.757, loss_att=50.360, acc=0.677, loss=55.579, backward_time=1.034, grad_norm=81.435, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.183, optim0_lr0=1.285e-04, train_time=2.901
+[gpub001:0/64] 2023-07-04 01:16:15,288 (trainer:732) INFO: 9epoch:train:5701-5800batch: iter_time=1.206e-04, forward_time=0.145, loss_ctc=80.702, loss_att=67.135, acc=0.662, loss=71.205, backward_time=1.033, grad_norm=92.187, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.285e-04, train_time=2.848
+[gpub001:0/64] 2023-07-04 01:19:17,300 (trainer:732) INFO: 9epoch:train:5801-5900batch: iter_time=1.355e-04, forward_time=0.146, loss_ctc=81.556, loss_att=61.325, acc=0.670, loss=67.394, backward_time=1.087, grad_norm=98.823, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.284e-04, train_time=3.640
+[gpub001:0/64] 2023-07-04 01:21:54,934 (trainer:732) INFO: 9epoch:train:5901-6000batch: iter_time=1.163e-04, forward_time=0.143, loss_ctc=78.794, loss_att=57.019, acc=0.673, loss=63.552, backward_time=1.039, grad_norm=110.341, clip=100.000, loss_scale=1.718e+10, optim_step_time=0.182, optim0_lr0=1.283e-04, train_time=3.152
+[gpub001:0/64] 2023-07-04 01:22:12,906 (multiple_iter_factory:32) INFO: Building 6th iter-factory...
+[gpub001:0/64] 2023-07-04 01:22:35,275 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 01:22:39,453 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.2", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.2", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.2", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.2", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 01:22:39,453 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.2,
+[gpub001:0/64] 2023-07-04 01:22:39,563 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 01:27:58,379 (trainer:732) INFO: 9epoch:train:6001-6100batch: iter_time=2.080, forward_time=0.171, loss_ctc=85.395, loss_att=61.689, acc=0.672, loss=68.801, backward_time=1.043, grad_norm=87.997, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.184, optim0_lr0=1.282e-04, train_time=7.268
+[gpub001:0/64] 2023-07-04 01:30:14,639 (trainer:732) INFO: 9epoch:train:6101-6200batch: iter_time=1.186e-04, forward_time=0.144, loss_ctc=73.105, loss_att=54.251, acc=0.658, loss=59.907, backward_time=1.024, grad_norm=81.029, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.281e-04, train_time=2.725
+[gpub001:0/64] 2023-07-04 01:32:30,963 (trainer:732) INFO: 9epoch:train:6201-6300batch: iter_time=1.238e-04, forward_time=0.144, loss_ctc=82.012, loss_att=61.684, acc=0.672, loss=67.782, backward_time=1.027, grad_norm=92.039, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.280e-04, train_time=2.726
+[gpub001:0/64] 2023-07-04 01:34:46,436 (trainer:732) INFO: 9epoch:train:6301-6400batch: iter_time=1.161e-04, forward_time=0.144, loss_ctc=68.673, loss_att=51.531, acc=0.660, loss=56.674, backward_time=1.022, grad_norm=84.864, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.280e-04, train_time=2.709
+[gpub001:0/64] 2023-07-04 01:37:02,196 (trainer:732) INFO: 9epoch:train:6401-6500batch: iter_time=1.343e-04, forward_time=0.146, loss_ctc=80.427, loss_att=63.759, acc=0.665, loss=68.760, backward_time=1.026, grad_norm=91.094, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.279e-04, train_time=2.715
+[gpub001:0/64] 2023-07-04 01:39:23,012 (trainer:732) INFO: 9epoch:train:6501-6600batch: iter_time=1.238e-04, forward_time=0.145, loss_ctc=71.841, loss_att=62.207, acc=0.654, loss=65.098, backward_time=1.028, grad_norm=91.719, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.278e-04, train_time=2.816
+[gpub001:0/64] 2023-07-04 01:41:50,816 (trainer:732) INFO: 9epoch:train:6601-6700batch: iter_time=1.201e-04, forward_time=0.145, loss_ctc=67.160, loss_att=49.622, acc=0.677, loss=54.883, backward_time=1.040, grad_norm=85.772, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.277e-04, train_time=2.956
+[gpub001:0/64] 2023-07-04 01:44:34,992 (trainer:732) INFO: 9epoch:train:6701-6800batch: iter_time=1.203e-04, forward_time=0.146, loss_ctc=80.981, loss_att=67.197, acc=0.664, loss=71.332, backward_time=1.065, grad_norm=96.825, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.276e-04, train_time=3.283
+[gpub001:0/64] 2023-07-04 01:47:19,271 (trainer:732) INFO: 9epoch:train:6801-6900batch: iter_time=1.257e-04, forward_time=0.145, loss_ctc=82.167, loss_att=60.632, acc=0.674, loss=67.092, backward_time=1.082, grad_norm=84.474, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.275e-04, train_time=3.285
+[gpub001:0/64] 2023-07-04 01:50:08,231 (trainer:732) INFO: 9epoch:train:6901-7000batch: iter_time=1.171e-04, forward_time=0.143, loss_ctc=79.723, loss_att=57.262, acc=0.677, loss=64.000, backward_time=1.133, grad_norm=106.009, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.275e-04, train_time=3.379
+[gpub001:0/64] 2023-07-04 01:50:26,658 (multiple_iter_factory:32) INFO: Building 7th iter-factory...
+[gpub001:0/64] 2023-07-04 01:50:49,043 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 01:50:53,273 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.5", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.5", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.5", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.5", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 01:50:53,273 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.5,
+[gpub001:0/64] 2023-07-04 01:50:53,280 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 01:56:44,529 (trainer:732) INFO: 9epoch:train:7001-7100batch: iter_time=1.794, forward_time=0.205, loss_ctc=85.101, loss_att=63.351, acc=0.681, loss=69.876, backward_time=1.054, grad_norm=94.225, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.274e-04, train_time=7.925
+[gpub001:0/64] 2023-07-04 01:59:09,417 (trainer:732) INFO: 9epoch:train:7101-7200batch: iter_time=1.088e-04, forward_time=0.145, loss_ctc=73.666, loss_att=55.956, acc=0.661, loss=61.269, backward_time=1.036, grad_norm=80.604, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.181, optim0_lr0=1.273e-04, train_time=2.898
+[gpub001:0/64] 2023-07-04 02:01:39,261 (trainer:732) INFO: 9epoch:train:7201-7300batch: iter_time=1.157e-04, forward_time=0.144, loss_ctc=81.381, loss_att=63.231, acc=0.677, loss=68.676, backward_time=1.040, grad_norm=84.281, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.181, optim0_lr0=1.272e-04, train_time=2.997
+[gpub001:0/64] 2023-07-04 02:03:59,902 (trainer:732) INFO: 9epoch:train:7301-7400batch: iter_time=1.094e-04, forward_time=0.144, loss_ctc=66.904, loss_att=50.560, acc=0.673, loss=55.463, backward_time=1.028, grad_norm=89.574, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.271e-04, train_time=2.813
+[gpub001:0/64] 2023-07-04 02:06:31,880 (trainer:732) INFO: 9epoch:train:7401-7500batch: iter_time=1.110e-04, forward_time=0.145, loss_ctc=79.781, loss_att=64.132, acc=0.667, loss=68.827, backward_time=1.043, grad_norm=81.963, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.270e-04, train_time=3.039
+[gpub001:0/64] 2023-07-04 02:09:19,491 (trainer:732) INFO: 9epoch:train:7501-7600batch: iter_time=1.151e-04, forward_time=0.143, loss_ctc=74.243, loss_att=62.088, acc=0.657, loss=65.734, backward_time=1.102, grad_norm=86.931, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.270e-04, train_time=3.352
+[gpub001:0/64] 2023-07-04 02:11:55,587 (trainer:732) INFO: 9epoch:train:7601-7700batch: iter_time=1.161e-04, forward_time=0.142, loss_ctc=67.305, loss_att=48.568, acc=0.681, loss=54.189, backward_time=1.068, grad_norm=80.772, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.181, optim0_lr0=1.269e-04, train_time=3.122
+[gpub001:0/64] 2023-07-04 02:14:33,924 (trainer:732) INFO: 9epoch:train:7701-7800batch: iter_time=1.073e-04, forward_time=0.143, loss_ctc=81.014, loss_att=66.986, acc=0.670, loss=71.195, backward_time=1.054, grad_norm=87.112, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.181, optim0_lr0=1.268e-04, train_time=3.166
+[gpub001:0/64] 2023-07-04 02:17:40,732 (trainer:732) INFO: 9epoch:train:7801-7900batch: iter_time=1.189e-04, forward_time=0.144, loss_ctc=81.461, loss_att=60.824, acc=0.683, loss=67.015, backward_time=1.088, grad_norm=88.042, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.181, optim0_lr0=1.267e-04, train_time=3.736
+[gpub001:0/64] 2023-07-04 02:20:31,309 (trainer:732) INFO: 9epoch:train:7901-8000batch: iter_time=1.118e-04, forward_time=0.143, loss_ctc=77.800, loss_att=55.616, acc=0.685, loss=62.271, backward_time=1.068, grad_norm=96.283, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.181, optim0_lr0=1.266e-04, train_time=3.411
+[gpub001:0/64] 2023-07-04 02:20:50,628 (multiple_iter_factory:32) INFO: Building 8th iter-factory...
+[gpub001:0/64] 2023-07-04 02:21:12,925 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 02:21:17,185 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.9", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.9", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.9", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.9", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 02:21:17,185 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.9,
+[gpub001:0/64] 2023-07-04 02:21:17,193 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 02:27:33,747 (trainer:732) INFO: 9epoch:train:8001-8100batch: iter_time=2.310, forward_time=0.216, loss_ctc=86.868, loss_att=62.477, acc=0.682, loss=69.795, backward_time=1.066, grad_norm=93.604, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.186, optim0_lr0=1.265e-04, train_time=8.448
+[gpub001:0/64] 2023-07-04 02:30:47,915 (trainer:732) INFO: 9epoch:train:8101-8200batch: iter_time=1.102e-04, forward_time=0.144, loss_ctc=72.041, loss_att=54.309, acc=0.663, loss=59.628, backward_time=1.122, grad_norm=85.541, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.265e-04, train_time=3.884
+[gpub001:0/64] 2023-07-04 02:34:23,550 (trainer:732) INFO: 9epoch:train:8201-8300batch: iter_time=1.110e-04, forward_time=0.143, loss_ctc=80.318, loss_att=62.685, acc=0.682, loss=67.975, backward_time=1.191, grad_norm=91.389, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.264e-04, train_time=4.312
+[gpub001:0/64] 2023-07-04 02:38:10,352 (trainer:732) INFO: 9epoch:train:8301-8400batch: iter_time=1.088e-04, forward_time=0.143, loss_ctc=66.182, loss_att=50.205, acc=0.672, loss=54.998, backward_time=1.197, grad_norm=84.213, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.263e-04, train_time=4.536
+[gpub001:0/64] 2023-07-04 02:41:23,634 (trainer:732) INFO: 9epoch:train:8401-8500batch: iter_time=1.048e-04, forward_time=0.144, loss_ctc=80.035, loss_att=64.016, acc=0.668, loss=68.822, backward_time=1.096, grad_norm=82.939, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.262e-04, train_time=3.865
+[gpub001:0/64] 2023-07-04 02:45:08,691 (trainer:732) INFO: 9epoch:train:8501-8600batch: iter_time=1.245e-04, forward_time=0.145, loss_ctc=72.274, loss_att=60.192, acc=0.664, loss=63.816, backward_time=1.123, grad_norm=86.066, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.261e-04, train_time=4.501
+[gpub001:0/64] 2023-07-04 02:48:18,435 (trainer:732) INFO: 9epoch:train:8601-8700batch: iter_time=1.254e-04, forward_time=0.146, loss_ctc=66.300, loss_att=48.163, acc=0.686, loss=53.604, backward_time=1.120, grad_norm=78.350, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.261e-04, train_time=3.795
+[gpub001:0/64] 2023-07-04 02:51:37,339 (trainer:732) INFO: 9epoch:train:8701-8800batch: iter_time=1.333e-04, forward_time=0.146, loss_ctc=80.163, loss_att=63.912, acc=0.678, loss=68.787, backward_time=1.095, grad_norm=86.030, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.260e-04, train_time=3.978
+[gpub001:0/64] 2023-07-04 02:54:31,632 (trainer:732) INFO: 9epoch:train:8801-8900batch: iter_time=1.137e-04, forward_time=0.145, loss_ctc=81.677, loss_att=59.992, acc=0.684, loss=66.498, backward_time=1.097, grad_norm=75.414, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.259e-04, train_time=3.486
+[gpub001:0/64] 2023-07-04 02:57:46,255 (trainer:732) INFO: 9epoch:train:8901-9000batch: iter_time=1.238e-04, forward_time=0.145, loss_ctc=76.679, loss_att=55.296, acc=0.689, loss=61.711, backward_time=1.165, grad_norm=99.487, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.258e-04, train_time=3.892
+[gpub001:0/64] 2023-07-04 02:58:06,283 (multiple_iter_factory:32) INFO: Building 9th iter-factory...
+[gpub001:0/64] 2023-07-04 02:58:29,174 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 02:58:33,478 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.6", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.6", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.6", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.6", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 02:58:33,478 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.6,
+[gpub001:0/64] 2023-07-04 02:58:33,485 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 03:05:00,125 (trainer:732) INFO: 9epoch:train:9001-9100batch: iter_time=1.878, forward_time=0.187, loss_ctc=86.043, loss_att=63.578, acc=0.668, loss=70.318, backward_time=1.044, grad_norm=92.448, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.184, optim0_lr0=1.257e-04, train_time=8.677
+[gpub001:0/64] 2023-07-04 03:07:20,404 (trainer:732) INFO: 9epoch:train:9101-9200batch: iter_time=1.129e-04, forward_time=0.144, loss_ctc=73.663, loss_att=55.071, acc=0.654, loss=60.649, backward_time=1.032, grad_norm=88.403, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.257e-04, train_time=2.806
+[gpub001:0/64] 2023-07-04 03:09:37,950 (trainer:732) INFO: 9epoch:train:9201-9300batch: iter_time=1.218e-04, forward_time=0.143, loss_ctc=80.458, loss_att=62.287, acc=0.676, loss=67.738, backward_time=1.027, grad_norm=92.966, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.256e-04, train_time=2.751
+[gpub001:0/64] 2023-07-04 03:11:53,135 (trainer:732) INFO: 9epoch:train:9301-9400batch: iter_time=1.238e-04, forward_time=0.144, loss_ctc=66.752, loss_att=50.047, acc=0.665, loss=55.059, backward_time=1.022, grad_norm=85.205, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.255e-04, train_time=2.703
+[gpub001:0/64] 2023-07-04 03:14:19,601 (trainer:732) INFO: 9epoch:train:9401-9500batch: iter_time=1.238e-04, forward_time=0.144, loss_ctc=80.260, loss_att=63.174, acc=0.665, loss=68.300, backward_time=1.039, grad_norm=86.691, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.254e-04, train_time=2.929
+[gpub001:0/64] 2023-07-04 03:17:07,879 (trainer:732) INFO: 9epoch:train:9501-9600batch: iter_time=1.143e-04, forward_time=0.150, loss_ctc=73.284, loss_att=61.761, acc=0.657, loss=65.218, backward_time=1.062, grad_norm=97.161, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.254e-04, train_time=3.365
+[gpub001:0/64] 2023-07-04 03:19:34,610 (trainer:732) INFO: 9epoch:train:9601-9700batch: iter_time=5.961e-04, forward_time=0.161, loss_ctc=66.042, loss_att=48.731, acc=0.682, loss=53.924, backward_time=1.039, grad_norm=76.636, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.253e-04, train_time=2.934
+[gpub001:0/64] 2023-07-04 03:21:58,061 (trainer:732) INFO: 9epoch:train:9701-9800batch: iter_time=1.166e-04, forward_time=0.175, loss_ctc=78.654, loss_att=65.270, acc=0.669, loss=69.285, backward_time=1.036, grad_norm=89.956, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.184, optim0_lr0=1.252e-04, train_time=2.869
+[gpub001:0/64] 2023-07-04 03:24:35,779 (trainer:732) INFO: 9epoch:train:9801-9900batch: iter_time=1.327e-04, forward_time=0.166, loss_ctc=81.981, loss_att=61.082, acc=0.676, loss=67.352, backward_time=1.044, grad_norm=85.085, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.251e-04, train_time=3.154
+[gpub001:0/64] 2023-07-04 03:27:16,735 (trainer:732) INFO: 9epoch:train:9901-10000batch: iter_time=1.079e-04, forward_time=0.171, loss_ctc=77.652, loss_att=56.185, acc=0.682, loss=62.625, backward_time=1.047, grad_norm=92.946, clip=100.000, loss_scale=3.436e+10, optim_step_time=0.182, optim0_lr0=1.250e-04, train_time=3.219
+[gpub001:0/64] 2023-07-04 03:40:25,652 (trainer:338) INFO: 9epoch results: [train] iter_time=0.199, forward_time=0.157, loss_ctc=77.876, loss_att=59.714, acc=0.667, loss=65.163, backward_time=1.051, grad_norm=90.766, clip=100.000, loss_scale=2.233e+10, optim_step_time=0.182, optim0_lr0=1.292e-04, train_time=3.584, time=4 hours, 59 minutes and 3.18 seconds, total_count=60000, gpu_max_cached_mem_GB=34.164, [valid] loss_ctc=58.837, cer_ctc=0.322, loss_att=48.196, acc=0.608, cer=0.461, wer=0.998, loss=51.388, time=6 minutes and 52.63 seconds, total_count=6578, gpu_max_cached_mem_GB=37.459, [att_plot] time=5 minutes and 53.06 seconds, total_count=0, gpu_max_cached_mem_GB=37.459
+[gpub001:0/64] 2023-07-04 03:40:45,004 (trainer:386) INFO: The best model has been updated: valid.acc, valid.total_count
+[gpub001:0/64] 2023-07-04 03:40:45,009 (trainer:440) INFO: The model files were removed: exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/4epoch.pth
+[gpub001:0/64] 2023-07-04 03:40:45,077 (trainer:272) INFO: 10/100epoch started. Estimated time to finish: 2 weeks, 5 days and 17 hours
+[gpub001:0/64] 2023-07-04 03:40:46,338 (multiple_iter_factory:32) INFO: Building 0th iter-factory...
+[gpub001:0/64] 2023-07-04 03:41:10,884 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 03:41:15,265 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.0", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.0", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.0", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.0", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 03:41:15,265 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.0,
+[gpub001:0/64] 2023-07-04 03:41:15,312 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 03:52:28,581 (trainer:732) INFO: 10epoch:train:1-100batch: iter_time=5.563, forward_time=0.206, loss_ctc=74.661, loss_att=61.707, acc=0.668, loss=65.594, backward_time=1.045, grad_norm=92.048, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.187, optim0_lr0=1.250e-04, train_time=14.055
+[gpub001:0/64] 2023-07-04 03:54:52,229 (trainer:732) INFO: 10epoch:train:101-200batch: iter_time=1.120e-04, forward_time=0.145, loss_ctc=79.571, loss_att=57.713, acc=0.649, loss=64.270, backward_time=1.039, grad_norm=111.566, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.249e-04, train_time=2.875
+[gpub001:0/64] 2023-07-04 03:57:21,049 (trainer:732) INFO: 10epoch:train:201-300batch: iter_time=1.163e-04, forward_time=0.145, loss_ctc=83.086, loss_att=62.207, acc=0.668, loss=68.471, backward_time=1.041, grad_norm=91.324, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.248e-04, train_time=2.976
+[gpub001:0/64] 2023-07-04 04:00:00,041 (trainer:732) INFO: 10epoch:train:301-400batch: iter_time=0.002, forward_time=0.201, loss_ctc=95.823, loss_att=89.716, acc=0.632, loss=91.548, backward_time=1.115, grad_norm=100.695, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.189, optim0_lr0=1.247e-04, train_time=3.179
+[gpub001:0/64] 2023-07-04 04:02:29,261 (trainer:732) INFO: 10epoch:train:401-500batch: iter_time=1.225e-04, forward_time=0.146, loss_ctc=85.114, loss_att=65.608, acc=0.636, loss=71.460, backward_time=1.044, grad_norm=99.338, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.246e-04, train_time=2.984
+[gpub001:0/64] 2023-07-04 04:05:16,066 (trainer:732) INFO: 10epoch:train:501-600batch: iter_time=1.073e-04, forward_time=0.145, loss_ctc=82.793, loss_att=59.421, acc=0.686, loss=66.432, backward_time=1.067, grad_norm=93.603, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.246e-04, train_time=3.336
+[gpub001:0/64] 2023-07-04 04:08:01,476 (trainer:732) INFO: 10epoch:train:601-700batch: iter_time=1.187e-04, forward_time=0.146, loss_ctc=80.618, loss_att=66.338, acc=0.662, loss=70.622, backward_time=1.058, grad_norm=93.226, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.245e-04, train_time=3.308
+[gpub001:0/64] 2023-07-04 04:10:30,382 (trainer:732) INFO: 10epoch:train:701-800batch: iter_time=1.108e-04, forward_time=0.146, loss_ctc=68.601, loss_att=54.680, acc=0.656, loss=58.856, backward_time=1.044, grad_norm=79.552, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.244e-04, train_time=2.978
+[gpub001:0/64] 2023-07-04 04:12:53,237 (trainer:732) INFO: 10epoch:train:801-900batch: iter_time=1.028e-04, forward_time=0.145, loss_ctc=85.022, loss_att=59.685, acc=0.652, loss=67.286, backward_time=1.036, grad_norm=92.734, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.243e-04, train_time=2.857
+[gpub001:0/64] 2023-07-04 04:15:35,870 (trainer:732) INFO: 10epoch:train:901-1000batch: iter_time=1.063e-04, forward_time=0.145, loss_ctc=78.810, loss_att=62.834, acc=0.664, loss=67.627, backward_time=1.081, grad_norm=81.075, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.243e-04, train_time=3.252
+[gpub001:0/64] 2023-07-04 04:15:49,814 (multiple_iter_factory:32) INFO: Building 1th iter-factory...
+[gpub001:0/64] 2023-07-04 04:16:11,858 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 04:16:16,315 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.6", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.6", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.6", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.6", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 04:16:16,315 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.6,
+[gpub001:0/64] 2023-07-04 04:16:16,322 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 04:22:44,047 (trainer:732) INFO: 10epoch:train:1001-1100batch: iter_time=2.290, forward_time=0.192, loss_ctc=71.839, loss_att=58.508, acc=0.676, loss=62.507, backward_time=1.050, grad_norm=82.985, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.185, optim0_lr0=1.242e-04, train_time=8.563
+[gpub001:0/64] 2023-07-04 04:25:13,405 (trainer:732) INFO: 10epoch:train:1101-1200batch: iter_time=1.072e-04, forward_time=0.144, loss_ctc=77.842, loss_att=57.499, acc=0.651, loss=63.602, backward_time=1.049, grad_norm=92.682, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.241e-04, train_time=2.987
+[gpub001:0/64] 2023-07-04 04:28:16,167 (trainer:732) INFO: 10epoch:train:1201-1300batch: iter_time=1.018e-04, forward_time=0.146, loss_ctc=79.271, loss_att=59.359, acc=0.677, loss=65.332, backward_time=1.078, grad_norm=81.278, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.240e-04, train_time=3.655
+[gpub001:0/64] 2023-07-04 04:30:50,171 (trainer:732) INFO: 10epoch:train:1301-1400batch: iter_time=1.297e-04, forward_time=0.145, loss_ctc=93.983, loss_att=86.915, acc=0.636, loss=89.035, backward_time=1.054, grad_norm=96.846, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.240e-04, train_time=3.080
+[gpub001:0/64] 2023-07-04 04:33:24,726 (trainer:732) INFO: 10epoch:train:1401-1500batch: iter_time=1.176e-04, forward_time=0.145, loss_ctc=82.254, loss_att=62.424, acc=0.642, loss=68.373, backward_time=1.061, grad_norm=95.559, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.239e-04, train_time=3.091
+[gpub001:0/64] 2023-07-04 04:36:06,544 (trainer:732) INFO: 10epoch:train:1501-1600batch: iter_time=1.104e-04, forward_time=0.143, loss_ctc=83.817, loss_att=59.295, acc=0.686, loss=66.652, backward_time=1.055, grad_norm=85.857, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.238e-04, train_time=3.236
+[gpub001:0/64] 2023-07-04 04:38:46,522 (trainer:732) INFO: 10epoch:train:1601-1700batch: iter_time=9.965e-05, forward_time=0.145, loss_ctc=80.193, loss_att=65.391, acc=0.664, loss=69.831, backward_time=1.061, grad_norm=85.978, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.237e-04, train_time=3.199
+[gpub001:0/64] 2023-07-04 04:41:26,350 (trainer:732) INFO: 10epoch:train:1701-1800batch: iter_time=1.123e-04, forward_time=0.144, loss_ctc=70.011, loss_att=53.793, acc=0.662, loss=58.658, backward_time=1.055, grad_norm=85.885, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.237e-04, train_time=3.196
+[gpub001:0/64] 2023-07-04 04:44:09,017 (trainer:732) INFO: 10epoch:train:1801-1900batch: iter_time=2.201e-04, forward_time=0.181, loss_ctc=83.456, loss_att=59.210, acc=0.658, loss=66.484, backward_time=1.066, grad_norm=92.746, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.187, optim0_lr0=1.236e-04, train_time=3.253
+[gpub001:0/64] 2023-07-04 04:46:37,714 (trainer:732) INFO: 10epoch:train:1901-2000batch: iter_time=1.199e-04, forward_time=0.183, loss_ctc=77.646, loss_att=63.155, acc=0.668, loss=67.502, backward_time=1.048, grad_norm=81.185, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.186, optim0_lr0=1.235e-04, train_time=2.974
+[gpub001:0/64] 2023-07-04 04:46:55,557 (multiple_iter_factory:32) INFO: Building 2th iter-factory...
+[gpub001:0/64] 2023-07-04 04:47:17,723 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 04:47:22,197 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.7", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.7", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.7", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.7", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 04:47:22,197 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.7,
+[gpub001:0/64] 2023-07-04 04:47:22,204 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 04:52:58,643 (trainer:732) INFO: 10epoch:train:2001-2100batch: iter_time=2.218, forward_time=0.151, loss_ctc=71.055, loss_att=59.286, acc=0.684, loss=62.817, backward_time=1.049, grad_norm=84.270, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.234e-04, train_time=7.619
+[gpub001:0/64] 2023-07-04 04:55:14,620 (trainer:732) INFO: 10epoch:train:2101-2200batch: iter_time=1.359e-04, forward_time=0.145, loss_ctc=77.590, loss_att=57.580, acc=0.661, loss=63.583, backward_time=1.027, grad_norm=85.515, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.234e-04, train_time=2.719
+[gpub001:0/64] 2023-07-04 04:57:30,708 (trainer:732) INFO: 10epoch:train:2201-2300batch: iter_time=1.257e-04, forward_time=0.147, loss_ctc=80.305, loss_att=59.243, acc=0.682, loss=65.561, backward_time=1.029, grad_norm=96.055, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.233e-04, train_time=2.722
+[gpub001:0/64] 2023-07-04 05:00:02,444 (trainer:732) INFO: 10epoch:train:2301-2400batch: iter_time=1.220e-04, forward_time=0.148, loss_ctc=91.760, loss_att=85.779, acc=0.656, loss=87.573, backward_time=1.057, grad_norm=104.082, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.232e-04, train_time=3.034
+[gpub001:0/64] 2023-07-04 05:02:21,395 (trainer:732) INFO: 10epoch:train:2401-2500batch: iter_time=1.104e-04, forward_time=0.145, loss_ctc=82.408, loss_att=62.987, acc=0.655, loss=68.813, backward_time=1.033, grad_norm=110.637, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.231e-04, train_time=2.779
+[gpub001:0/64] 2023-07-04 05:04:43,190 (trainer:732) INFO: 10epoch:train:2501-2600batch: iter_time=1.250e-04, forward_time=0.146, loss_ctc=81.831, loss_att=58.991, acc=0.696, loss=65.843, backward_time=1.038, grad_norm=132.918, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.231e-04, train_time=2.836
+[gpub001:0/64] 2023-07-04 05:07:37,477 (trainer:732) INFO: 10epoch:train:2601-2700batch: iter_time=1.360e-04, forward_time=0.146, loss_ctc=79.920, loss_att=63.924, acc=0.673, loss=68.723, backward_time=1.119, grad_norm=92.002, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.230e-04, train_time=3.486
+[gpub001:0/64] 2023-07-04 05:10:26,668 (trainer:732) INFO: 10epoch:train:2701-2800batch: iter_time=1.242e-04, forward_time=0.145, loss_ctc=68.184, loss_att=53.646, acc=0.669, loss=58.007, backward_time=1.089, grad_norm=101.305, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.229e-04, train_time=3.384
+[gpub001:0/64] 2023-07-04 05:13:12,189 (trainer:732) INFO: 10epoch:train:2801-2900batch: iter_time=1.254e-04, forward_time=0.147, loss_ctc=82.436, loss_att=57.796, acc=0.672, loss=65.188, backward_time=1.060, grad_norm=88.713, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.228e-04, train_time=3.310
+[gpub001:0/64] 2023-07-04 05:15:42,484 (trainer:732) INFO: 10epoch:train:2901-3000batch: iter_time=1.035e-04, forward_time=0.146, loss_ctc=77.301, loss_att=60.873, acc=0.678, loss=65.801, backward_time=1.049, grad_norm=93.069, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.228e-04, train_time=3.006
+[gpub001:0/64] 2023-07-04 05:15:44,046 (multiple_iter_factory:32) INFO: Building 3th iter-factory...
+[gpub001:0/64] 2023-07-04 05:16:06,754 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 05:16:11,272 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.8", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.8", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.8", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.8", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 05:16:11,272 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.8,
+[gpub001:0/64] 2023-07-04 05:16:11,280 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 05:23:59,629 (trainer:732) INFO: 10epoch:train:3001-3100batch: iter_time=1.599, forward_time=0.223, loss_ctc=72.196, loss_att=59.196, acc=0.672, loss=63.096, backward_time=1.053, grad_norm=81.648, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.187, optim0_lr0=1.227e-04, train_time=9.943
+[gpub001:0/64] 2023-07-04 05:26:40,677 (trainer:732) INFO: 10epoch:train:3101-3200batch: iter_time=1.388e-04, forward_time=0.145, loss_ctc=75.881, loss_att=56.536, acc=0.656, loss=62.340, backward_time=1.067, grad_norm=82.455, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.226e-04, train_time=3.221
+[gpub001:0/64] 2023-07-04 05:29:15,619 (trainer:732) INFO: 10epoch:train:3201-3300batch: iter_time=1.326e-04, forward_time=0.147, loss_ctc=79.697, loss_att=58.130, acc=0.679, loss=64.600, backward_time=1.053, grad_norm=94.457, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.225e-04, train_time=3.099
+[gpub001:0/64] 2023-07-04 05:31:55,579 (trainer:732) INFO: 10epoch:train:3301-3400batch: iter_time=1.271e-04, forward_time=0.146, loss_ctc=91.505, loss_att=85.159, acc=0.645, loss=87.063, backward_time=1.071, grad_norm=111.923, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.225e-04, train_time=3.199
+[gpub001:0/64] 2023-07-04 05:35:09,355 (trainer:732) INFO: 10epoch:train:3401-3500batch: iter_time=1.234e-04, forward_time=0.146, loss_ctc=80.885, loss_att=61.538, acc=0.648, loss=67.342, backward_time=1.126, grad_norm=93.020, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.224e-04, train_time=3.875
+[gpub001:0/64] 2023-07-04 05:38:07,869 (trainer:732) INFO: 10epoch:train:3501-3600batch: iter_time=1.136e-04, forward_time=0.147, loss_ctc=79.992, loss_att=57.211, acc=0.695, loss=64.045, backward_time=1.084, grad_norm=91.127, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.223e-04, train_time=3.570
+[gpub001:0/64] 2023-07-04 05:41:06,528 (trainer:732) INFO: 10epoch:train:3601-3700batch: iter_time=1.135e-04, forward_time=0.145, loss_ctc=78.588, loss_att=63.951, acc=0.669, loss=68.342, backward_time=1.080, grad_norm=95.189, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.222e-04, train_time=3.573
+[gpub001:0/64] 2023-07-04 05:43:41,455 (trainer:732) INFO: 10epoch:train:3701-3800batch: iter_time=1.262e-04, forward_time=0.146, loss_ctc=67.525, loss_att=52.975, acc=0.664, loss=57.340, backward_time=1.047, grad_norm=79.137, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.183, optim0_lr0=1.222e-04, train_time=3.098
+[gpub001:0/64] 2023-07-04 05:46:38,398 (trainer:732) INFO: 10epoch:train:3801-3900batch: iter_time=1.189e-04, forward_time=0.146, loss_ctc=83.184, loss_att=57.702, acc=0.661, loss=65.347, backward_time=1.081, grad_norm=85.750, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.184, optim0_lr0=1.221e-04, train_time=3.539
+[gpub001:0/64] 2023-07-04 05:49:41,596 (trainer:732) INFO: 10epoch:train:3901-4000batch: iter_time=7.671e-04, forward_time=0.232, loss_ctc=78.322, loss_att=62.743, acc=0.672, loss=67.416, backward_time=1.090, grad_norm=78.715, clip=100.000, loss_scale=6.872e+10, optim_step_time=0.189, optim0_lr0=1.220e-04, train_time=3.664
+[gpub001:0/64] 2023-07-04 05:49:54,817 (multiple_iter_factory:32) INFO: Building 4th iter-factory...
+[gpub001:0/64] 2023-07-04 05:50:17,125 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 05:50:21,327 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.9", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.9", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.9", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.9", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 05:50:21,327 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.9,
+[gpub001:0/64] 2023-07-04 05:50:21,337 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 05:57:30,927 (trainer:732) INFO: 10epoch:train:4001-4100batch: iter_time=2.608, forward_time=0.189, loss_ctc=70.837, loss_att=58.363, acc=0.687, loss=62.105, backward_time=1.088, grad_norm=80.611, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.186, optim0_lr0=1.219e-04, train_time=9.386
+[gpub001:0/64] 2023-07-04 05:59:46,877 (trainer:732) INFO: 10epoch:train:4101-4200batch: iter_time=1.047e-04, forward_time=0.146, loss_ctc=76.833, loss_att=56.960, acc=0.666, loss=62.922, backward_time=1.029, grad_norm=98.324, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.219e-04, train_time=2.719
+[gpub001:0/64] 2023-07-04 06:02:08,031 (trainer:732) INFO: 10epoch:train:4201-4300batch: iter_time=1.282e-04, forward_time=0.149, loss_ctc=77.560, loss_att=57.872, acc=0.688, loss=63.779, backward_time=1.038, grad_norm=85.425, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.218e-04, train_time=2.823
+[gpub001:0/64] 2023-07-04 06:04:26,653 (trainer:732) INFO: 10epoch:train:4301-4400batch: iter_time=1.141e-04, forward_time=0.147, loss_ctc=92.369, loss_att=84.411, acc=0.657, loss=86.799, backward_time=1.034, grad_norm=104.688, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.217e-04, train_time=2.772
+[gpub001:0/64] 2023-07-04 06:06:42,171 (trainer:732) INFO: 10epoch:train:4401-4500batch: iter_time=1.068e-04, forward_time=0.146, loss_ctc=80.486, loss_att=61.536, acc=0.660, loss=67.221, backward_time=1.025, grad_norm=95.744, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.217e-04, train_time=2.710
+[gpub001:0/64] 2023-07-04 06:08:58,348 (trainer:732) INFO: 10epoch:train:4501-4600batch: iter_time=1.054e-04, forward_time=0.147, loss_ctc=80.334, loss_att=58.029, acc=0.703, loss=64.720, backward_time=1.031, grad_norm=100.954, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.216e-04, train_time=2.723
+[gpub001:0/64] 2023-07-04 06:11:28,165 (trainer:732) INFO: 10epoch:train:4601-4700batch: iter_time=1.078e-04, forward_time=0.146, loss_ctc=77.401, loss_att=63.987, acc=0.673, loss=68.011, backward_time=1.064, grad_norm=87.364, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.215e-04, train_time=2.996
+[gpub001:0/64] 2023-07-04 06:14:13,116 (trainer:732) INFO: 10epoch:train:4701-4800batch: iter_time=1.077e-04, forward_time=0.147, loss_ctc=66.979, loss_att=52.793, acc=0.677, loss=57.049, backward_time=1.061, grad_norm=71.593, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.214e-04, train_time=3.299
+[gpub001:0/64] 2023-07-04 06:16:43,533 (trainer:732) INFO: 10epoch:train:4801-4900batch: iter_time=1.076e-04, forward_time=0.162, loss_ctc=83.390, loss_att=59.533, acc=0.672, loss=66.690, backward_time=1.046, grad_norm=101.548, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.214e-04, train_time=3.008
+[gpub001:0/64] 2023-07-04 06:19:33,266 (trainer:732) INFO: 10epoch:train:4901-5000batch: iter_time=5.750e-04, forward_time=0.213, loss_ctc=76.339, loss_att=59.292, acc=0.682, loss=64.406, backward_time=1.106, grad_norm=94.146, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.187, optim0_lr0=1.213e-04, train_time=3.394
+[gpub001:0/64] 2023-07-04 06:19:53,288 (multiple_iter_factory:32) INFO: Building 5th iter-factory...
+[gpub001:0/64] 2023-07-04 06:20:15,638 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub001:0/64] 2023-07-04 06:20:19,922 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.2", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.2", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.2", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.2", "type": "text"}
+ preprocess: )
+[gpub001:0/64] 2023-07-04 06:20:19,922 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.2,
+[gpub001:0/64] 2023-07-04 06:20:19,929 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129
+[gpub001:0/64] 2023-07-04 06:27:20,326 (trainer:732) INFO: 10epoch:train:5001-5100batch: iter_time=2.350, forward_time=0.170, loss_ctc=71.177, loss_att=57.991, acc=0.682, loss=61.947, backward_time=1.051, grad_norm=82.378, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.185, optim0_lr0=1.212e-04, train_time=9.341
+[gpub001:0/64] 2023-07-04 06:29:40,314 (trainer:732) INFO: 10epoch:train:5101-5200batch: iter_time=1.100e-04, forward_time=0.146, loss_ctc=75.335, loss_att=55.900, acc=0.661, loss=61.731, backward_time=1.032, grad_norm=90.481, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.212e-04, train_time=2.800
+[gpub001:0/64] 2023-07-04 06:31:57,088 (trainer:732) INFO: 10epoch:train:5201-5300batch: iter_time=1.165e-04, forward_time=0.146, loss_ctc=79.123, loss_att=58.087, acc=0.682, loss=64.398, backward_time=1.031, grad_norm=88.409, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.211e-04, train_time=2.735
+[gpub001:0/64] 2023-07-04 06:34:23,229 (trainer:732) INFO: 10epoch:train:5301-5400batch: iter_time=1.130e-04, forward_time=0.147, loss_ctc=90.993, loss_att=83.458, acc=0.647, loss=85.719, backward_time=1.042, grad_norm=94.734, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.210e-04, train_time=2.923
+[gpub001:0/64] 2023-07-04 06:36:38,714 (trainer:732) INFO: 10epoch:train:5401-5500batch: iter_time=1.152e-04, forward_time=0.145, loss_ctc=80.319, loss_att=60.819, acc=0.653, loss=66.669, backward_time=1.025, grad_norm=93.349, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.209e-04, train_time=2.709
+[gpub001:0/64] 2023-07-04 06:38:58,311 (trainer:732) INFO: 10epoch:train:5501-5600batch: iter_time=1.103e-04, forward_time=0.145, loss_ctc=79.823, loss_att=56.920, acc=0.697, loss=63.791, backward_time=1.031, grad_norm=86.237, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.209e-04, train_time=2.792
+[gpub001:0/64] 2023-07-04 06:41:19,825 (trainer:732) INFO: 10epoch:train:5601-5700batch: iter_time=1.113e-04, forward_time=0.144, loss_ctc=79.289, loss_att=64.586, acc=0.666, loss=68.997, backward_time=1.035, grad_norm=117.701, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.208e-04, train_time=2.830
+[gpub001:0/64] 2023-07-04 06:43:36,922 (trainer:732) INFO: 10epoch:train:5701-5800batch: iter_time=1.127e-04, forward_time=0.146, loss_ctc=65.745, loss_att=51.767, acc=0.674, loss=55.961, backward_time=1.030, grad_norm=84.338, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.207e-04, train_time=2.742
+[gpub001:0/64] 2023-07-04 06:46:10,640 (trainer:732) INFO: 10epoch:train:5801-5900batch: iter_time=5.857e-04, forward_time=0.153, loss_ctc=83.685, loss_att=57.586, acc=0.660, loss=65.415, backward_time=1.047, grad_norm=90.102, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.207e-04, train_time=3.074
+[gpub001:0/64] 2023-07-04 06:48:48,486 (trainer:732) INFO: 10epoch:train:5901-6000batch: iter_time=1.104e-04, forward_time=0.190, loss_ctc=76.571, loss_att=62.067, acc=0.674, loss=66.418, backward_time=1.059, grad_norm=83.226, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.206e-04, train_time=3.157
+[gpub001:0/64] 2023-07-04 06:48:54,345 (multiple_iter_factory:32) INFO: Building 6th iter-factory...
+[gpub001:0/64] 2023-07-04 06:49:17,118 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/64] 2023-07-04 06:49:21,432 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.4", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.4", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.4", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.4", "type": "text"} + preprocess: ) +[gpub001:0/64] 2023-07-04 06:49:21,432 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.4, +[gpub001:0/64] 2023-07-04 06:49:21,440 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129 +[gpub001:0/64] 2023-07-04 06:54:57,248 (trainer:732) INFO: 10epoch:train:6001-6100batch: iter_time=1.887, forward_time=0.176, loss_ctc=72.668, loss_att=58.875, acc=0.681, loss=63.013, backward_time=1.047, grad_norm=78.204, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.185, optim0_lr0=1.205e-04, train_time=7.374 +[gpub001:0/64] 2023-07-04 06:57:13,494 (trainer:732) INFO: 10epoch:train:6101-6200batch: iter_time=1.222e-04, forward_time=0.147, loss_ctc=75.100, loss_att=56.125, acc=0.660, loss=61.817, backward_time=1.028, grad_norm=96.151, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.205e-04, train_time=2.726 +[gpub001:0/64] 2023-07-04 06:59:29,483 (trainer:732) INFO: 10epoch:train:6201-6300batch: iter_time=1.260e-04, forward_time=0.149, loss_ctc=78.803, loss_att=58.253, acc=0.683, loss=64.418, backward_time=1.028, grad_norm=92.452, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.204e-04, train_time=2.720 +[gpub001:0/64] 2023-07-04 07:01:57,947 (trainer:732) INFO: 10epoch:train:6301-6400batch: iter_time=1.262e-04, forward_time=0.166, loss_ctc=92.014, loss_att=84.009, acc=0.648, loss=86.411, backward_time=1.045, grad_norm=99.864, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.185, optim0_lr0=1.203e-04, train_time=2.969 +[gpub001:0/64] 2023-07-04 07:04:34,258 (trainer:732) INFO: 10epoch:train:6401-6500batch: iter_time=1.256e-04, forward_time=0.156, loss_ctc=78.738, loss_att=59.782, acc=0.657, loss=65.469, backward_time=1.119, grad_norm=101.854, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.185, optim0_lr0=1.202e-04, train_time=3.126 +[gpub001:0/64] 2023-07-04 07:06:55,517 (trainer:732) INFO: 10epoch:train:6501-6600batch: iter_time=1.241e-04, forward_time=0.168, loss_ctc=79.093, loss_att=56.679, acc=0.698, loss=63.403, backward_time=1.032, grad_norm=82.095, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.202e-04, train_time=2.825 +[gpub001:0/64] 2023-07-04 07:09:19,221 (trainer:732) INFO: 10epoch:train:6601-6700batch: iter_time=1.178e-04, forward_time=0.177, loss_ctc=77.971, loss_att=64.064, acc=0.671, loss=68.236, backward_time=1.057, grad_norm=85.100, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.201e-04, train_time=2.874 +[gpub001:0/64] 2023-07-04 07:11:39,694 (trainer:732) INFO: 10epoch:train:6701-6800batch: iter_time=1.091e-04, forward_time=0.146, loss_ctc=66.743, loss_att=52.391, acc=0.671, loss=56.697, backward_time=1.034, grad_norm=88.159, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.200e-04, 
train_time=2.809 +[gpub001:0/64] 2023-07-04 07:14:01,640 (trainer:732) INFO: 10epoch:train:6801-6900batch: iter_time=1.103e-04, forward_time=0.147, loss_ctc=82.598, loss_att=57.601, acc=0.667, loss=65.100, backward_time=1.035, grad_norm=88.853, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.200e-04, train_time=2.839 +[gpub001:0/64] 2023-07-04 07:16:24,881 (trainer:732) INFO: 10epoch:train:6901-7000batch: iter_time=1.054e-04, forward_time=0.147, loss_ctc=74.921, loss_att=60.937, acc=0.676, loss=65.132, backward_time=1.031, grad_norm=86.997, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.199e-04, train_time=2.865 +[gpub001:0/64] 2023-07-04 07:16:37,979 (multiple_iter_factory:32) INFO: Building 7th iter-factory... +[gpub001:0/64] 2023-07-04 07:17:00,253 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/64] 2023-07-04 07:17:04,527 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.5", "type": "text"} + preprocess: ) +[gpub001:0/64] 2023-07-04 07:17:04,527 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.5, +[gpub001:0/64] 2023-07-04 07:17:04,630 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129 +[gpub001:0/64] 2023-07-04 07:25:43,021 (trainer:732) INFO: 10epoch:train:7001-7100batch: iter_time=2.361, forward_time=0.227, loss_ctc=70.630, loss_att=58.023, acc=0.692, loss=61.805, backward_time=1.046, grad_norm=83.473, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.188, optim0_lr0=1.198e-04, train_time=11.161 +[gpub001:0/64] 2023-07-04 07:27:58,795 (trainer:732) INFO: 10epoch:train:7101-7200batch: iter_time=1.384e-04, forward_time=0.145, loss_ctc=74.050, loss_att=55.642, acc=0.672, loss=61.165, backward_time=1.026, grad_norm=90.165, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.198e-04, train_time=2.717 +[gpub001:0/64] 2023-07-04 07:30:21,144 (trainer:732) INFO: 10epoch:train:7201-7300batch: iter_time=1.130e-04, forward_time=0.146, loss_ctc=77.786, loss_att=57.538, acc=0.689, loss=63.612, backward_time=1.035, grad_norm=81.644, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.197e-04, train_time=2.847 +[gpub001:0/64] 2023-07-04 07:32:41,864 (trainer:732) INFO: 10epoch:train:7301-7400batch: iter_time=9.930e-05, forward_time=0.147, loss_ctc=90.070, loss_att=83.807, acc=0.661, loss=85.686, backward_time=1.045, grad_norm=98.326, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.196e-04, train_time=2.814 +[gpub001:0/64] 2023-07-04 07:35:08,429 (trainer:732) INFO: 10epoch:train:7401-7500batch: iter_time=1.019e-04, forward_time=0.145, loss_ctc=79.856, loss_att=61.766, acc=0.661, loss=67.193, backward_time=1.040, grad_norm=89.441, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.196e-04, train_time=2.931 +[gpub001:0/64] 2023-07-04 07:37:26,655 (trainer:732) INFO: 10epoch:train:7501-7600batch: iter_time=1.190e-04, forward_time=0.145, loss_ctc=81.266, loss_att=58.514, acc=0.702, loss=65.340, 
backward_time=1.030, grad_norm=92.735, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.195e-04, train_time=2.764 +[gpub001:0/64] 2023-07-04 07:40:02,165 (trainer:732) INFO: 10epoch:train:7601-7700batch: iter_time=5.351e-04, forward_time=0.147, loss_ctc=76.333, loss_att=62.310, acc=0.677, loss=66.517, backward_time=1.056, grad_norm=79.911, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.194e-04, train_time=3.110 +[gpub001:0/64] 2023-07-04 07:42:23,167 (trainer:732) INFO: 10epoch:train:7701-7800batch: iter_time=9.641e-05, forward_time=0.145, loss_ctc=66.356, loss_att=52.050, acc=0.681, loss=56.342, backward_time=1.039, grad_norm=73.483, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.183, optim0_lr0=1.194e-04, train_time=2.820 +[gpub001:0/64] 2023-07-04 07:44:57,556 (trainer:732) INFO: 10epoch:train:7801-7900batch: iter_time=9.989e-05, forward_time=0.155, loss_ctc=81.235, loss_att=57.248, acc=0.677, loss=64.444, backward_time=1.065, grad_norm=86.881, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.193e-04, train_time=3.088 +[gpub001:0/64] 2023-07-04 07:48:09,978 (trainer:732) INFO: 10epoch:train:7901-8000batch: iter_time=1.022e-04, forward_time=0.170, loss_ctc=76.555, loss_att=60.229, acc=0.685, loss=65.127, backward_time=1.097, grad_norm=79.075, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.192e-04, train_time=3.848 +[gpub001:0/64] 2023-07-04 07:48:25,594 (multiple_iter_factory:32) INFO: Building 8th iter-factory... +[gpub001:0/64] 2023-07-04 07:48:48,049 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/64] 2023-07-04 07:48:52,289 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.1", "type": "text"} + preprocess: ) +[gpub001:0/64] 2023-07-04 07:48:52,289 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.1, +[gpub001:0/64] 2023-07-04 07:48:52,297 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129 +[gpub001:0/64] 2023-07-04 07:55:59,186 (trainer:732) INFO: 10epoch:train:8001-8100batch: iter_time=2.602, forward_time=0.191, loss_ctc=70.351, loss_att=57.246, acc=0.695, loss=61.178, backward_time=1.076, grad_norm=77.703, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.185, optim0_lr0=1.191e-04, train_time=9.384 +[gpub001:0/64] 2023-07-04 07:58:31,433 (trainer:732) INFO: 10epoch:train:8101-8200batch: iter_time=1.315e-04, forward_time=0.146, loss_ctc=74.708, loss_att=54.669, acc=0.672, loss=60.681, backward_time=1.055, grad_norm=103.432, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.191e-04, train_time=3.045 +[gpub001:0/64] 2023-07-04 08:01:03,790 (trainer:732) INFO: 10epoch:train:8201-8300batch: iter_time=1.264e-04, forward_time=0.147, loss_ctc=77.423, loss_att=57.831, acc=0.689, loss=63.709, backward_time=1.050, grad_norm=101.802, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.190e-04, train_time=3.047 +[gpub001:0/64] 2023-07-04 08:03:43,672 (trainer:732) INFO: 
10epoch:train:8301-8400batch: iter_time=1.320e-04, forward_time=0.146, loss_ctc=90.631, loss_att=83.753, acc=0.660, loss=85.816, backward_time=1.067, grad_norm=95.528, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.189e-04, train_time=3.197 +[gpub001:0/64] 2023-07-04 08:06:18,852 (trainer:732) INFO: 10epoch:train:8401-8500batch: iter_time=1.261e-04, forward_time=0.147, loss_ctc=78.090, loss_att=59.472, acc=0.668, loss=65.057, backward_time=1.065, grad_norm=97.820, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.189e-04, train_time=3.103 +[gpub001:0/64] 2023-07-04 08:08:44,080 (trainer:732) INFO: 10epoch:train:8501-8600batch: iter_time=1.136e-04, forward_time=0.147, loss_ctc=81.800, loss_att=58.282, acc=0.704, loss=65.338, backward_time=1.043, grad_norm=86.480, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.188e-04, train_time=2.904 +[gpub001:0/64] 2023-07-04 08:11:19,247 (trainer:732) INFO: 10epoch:train:8601-8700batch: iter_time=1.271e-04, forward_time=0.146, loss_ctc=77.659, loss_att=64.031, acc=0.675, loss=68.119, backward_time=1.051, grad_norm=85.748, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.187e-04, train_time=3.103 +[gpub001:0/64] 2023-07-04 08:14:02,057 (trainer:732) INFO: 10epoch:train:8701-8800batch: iter_time=1.326e-04, forward_time=0.146, loss_ctc=66.799, loss_att=52.036, acc=0.681, loss=56.465, backward_time=1.059, grad_norm=73.862, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.187e-04, train_time=3.256 +[gpub001:0/64] 2023-07-04 08:16:37,378 (trainer:732) INFO: 10epoch:train:8801-8900batch: iter_time=1.286e-04, forward_time=0.146, loss_ctc=82.147, loss_att=57.347, acc=0.680, loss=64.787, backward_time=1.052, grad_norm=89.496, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.186e-04, train_time=3.106 +[gpub001:0/64] 2023-07-04 08:19:08,726 (trainer:732) INFO: 10epoch:train:8901-9000batch: iter_time=1.263e-04, forward_time=0.146, loss_ctc=75.952, loss_att=59.955, acc=0.682, loss=64.755, backward_time=1.047, grad_norm=84.576, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.185e-04, train_time=3.027 +[gpub001:0/64] 2023-07-04 08:19:28,754 (multiple_iter_factory:32) INFO: Building 9th iter-factory... 
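Editor's note: between the 7901-8000 and 8001-8100 batch lines above, loss_scale jumps from 1.374e+11 to 2.749e+11, i.e. from 2^37 to 2^38. That doubling is the signature of dynamic loss scaling in mixed-precision training: the scale grows after a long run of overflow-free steps and is halved whenever gradients overflow. A minimal sketch of the policy (GradScaler-style; illustrative, not ESPnet's exact implementation, and the growth_interval value is an assumption):

    class DynamicLossScaler:
        """Grow the loss scale after stable steps, halve it on overflow."""

        def __init__(self, scale: float = 2.0**37, growth_interval: int = 2000):
            self.scale = scale            # multiplied into the loss before backward()
            self.growth_interval = growth_interval
            self._good_steps = 0

        def update(self, found_inf: bool) -> None:
            if found_inf:
                self.scale /= 2.0         # back off after an overflow
                self._good_steps = 0
            else:
                self._good_steps += 1
                if self._good_steps >= self.growth_interval:
                    self.scale *= 2.0     # 1.374e+11 -> 2.749e+11, as in the log
                    self._good_steps = 0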
+[gpub001:0/64] 2023-07-04 08:19:51,073 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/64] 2023-07-04 08:19:55,348 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.3", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.3", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.3", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.3", "type": "text"} + preprocess: ) +[gpub001:0/64] 2023-07-04 08:19:55,348 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.3, +[gpub001:0/64] 2023-07-04 08:19:55,408 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129 +[gpub001:0/64] 2023-07-04 08:26:15,955 (trainer:732) INFO: 10epoch:train:9001-9100batch: iter_time=2.440, forward_time=0.246, loss_ctc=70.151, loss_att=57.464, acc=0.695, loss=61.270, backward_time=1.053, grad_norm=83.805, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.187, optim0_lr0=1.185e-04, train_time=8.544 +[gpub001:0/64] 2023-07-04 08:28:32,451 (trainer:732) INFO: 10epoch:train:9101-9200batch: iter_time=1.067e-04, forward_time=0.145, loss_ctc=75.009, loss_att=55.262, acc=0.672, loss=61.186, backward_time=1.028, grad_norm=94.917, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.184e-04, train_time=2.730 +[gpub001:0/64] 2023-07-04 08:30:51,931 (trainer:732) INFO: 10epoch:train:9201-9300batch: iter_time=1.173e-04, forward_time=0.148, loss_ctc=77.181, loss_att=56.772, acc=0.694, loss=62.895, backward_time=1.031, grad_norm=83.826, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.184, optim0_lr0=1.183e-04, train_time=2.789 +[gpub001:0/64] 2023-07-04 08:33:11,956 (trainer:732) INFO: 10epoch:train:9301-9400batch: iter_time=1.225e-04, forward_time=0.149, loss_ctc=87.476, loss_att=81.540, acc=0.668, loss=83.321, backward_time=1.037, grad_norm=99.964, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.183e-04, train_time=2.800 +[gpub001:0/64] 2023-07-04 08:35:38,898 (trainer:732) INFO: 10epoch:train:9401-9500batch: iter_time=1.206e-04, forward_time=0.147, loss_ctc=78.958, loss_att=59.832, acc=0.663, loss=65.570, backward_time=1.040, grad_norm=101.896, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.182e-04, train_time=2.939 +[gpub001:0/64] 2023-07-04 08:38:05,571 (trainer:732) INFO: 10epoch:train:9501-9600batch: iter_time=1.192e-04, forward_time=0.147, loss_ctc=79.300, loss_att=57.356, acc=0.708, loss=63.939, backward_time=1.043, grad_norm=85.796, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.181e-04, train_time=2.933 +[gpub001:0/64] 2023-07-04 08:40:51,798 (trainer:732) INFO: 10epoch:train:9601-9700batch: iter_time=1.288e-04, forward_time=0.148, loss_ctc=78.120, loss_att=64.613, acc=0.677, loss=68.665, backward_time=1.077, grad_norm=116.603, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.181e-04, train_time=3.324 +[gpub001:0/64] 2023-07-04 08:43:38,644 (trainer:732) INFO: 10epoch:train:9701-9800batch: iter_time=1.182e-04, forward_time=0.147, loss_ctc=66.807, loss_att=51.983, acc=0.683, loss=56.430, backward_time=1.086, grad_norm=86.590, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.180e-04, 
train_time=3.337 +[gpub001:0/64] 2023-07-04 08:46:13,955 (trainer:732) INFO: 10epoch:train:9801-9900batch: iter_time=1.012e-04, forward_time=0.147, loss_ctc=81.321, loss_att=58.560, acc=0.675, loss=65.388, backward_time=1.046, grad_norm=87.678, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.179e-04, train_time=3.106 +[gpub001:0/64] 2023-07-04 08:48:56,080 (trainer:732) INFO: 10epoch:train:9901-10000batch: iter_time=1.048e-04, forward_time=0.146, loss_ctc=75.337, loss_att=58.584, acc=0.688, loss=63.610, backward_time=1.058, grad_norm=81.275, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.179e-04, train_time=3.242 +[gpub001:0/64] 2023-07-04 09:01:39,700 (trainer:338) INFO: 10epoch results: [train] iter_time=0.259, forward_time=0.156, loss_ctc=78.576, loss_att=61.451, acc=0.672, loss=66.588, backward_time=1.054, grad_norm=91.032, clip=100.000, loss_scale=1.374e+11, optim_step_time=0.184, optim0_lr0=1.213e-04, train_time=3.698, time=5 hours, 8 minutes and 33.64 seconds, total_count=70000, gpu_max_cached_mem_GB=37.459, [valid] loss_ctc=58.858, cer_ctc=0.319, loss_att=49.021, acc=0.607, cer=0.458, wer=1.000, loss=51.972, time=6 minutes and 30.14 seconds, total_count=7590, gpu_max_cached_mem_GB=37.459, [att_plot] time=5 minutes and 50.7 seconds, total_count=0, gpu_max_cached_mem_GB=37.459 +[gpub001:0/64] 2023-07-04 09:01:59,109 (trainer:386) INFO: The best model has been updated: valid.total_count +[gpub001:0/64] 2023-07-04 09:02:00,803 (average_nbest_models:69) INFO: Averaging 5best models: criterion="valid.acc": exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/valid.acc.ave_5best.till10epoch.pth +[gpub001:0/64] 2023-07-04 09:02:44,194 (average_nbest_models:69) INFO: Averaging 5best models: criterion="valid.total_count": exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/valid.total_count.ave_5best.till10epoch.pth +[gpub001:0/64] 2023-07-04 09:02:51,878 (trainer:440) INFO: The model files were removed: exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/5epoch.pth +[gpub001:0/64] 2023-07-04 09:02:51,945 (trainer:272) INFO: 11/100epoch started. Estimated time to finish: 2 weeks, 5 days and 19 hours +[gpub001:0/64] 2023-07-04 09:02:53,579 (multiple_iter_factory:32) INFO: Building 0th iter-factory... 
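Editor's note: the 10epoch results above end with average_nbest_models writing valid.acc.ave_5best.till10epoch.pth and valid.total_count.ave_5best.till10epoch.pth: the five checkpoints ranked best by each validation criterion are merged into a single model by parameter-wise averaging, which usually generalizes better than any individual snapshot. A minimal sketch of such averaging, assuming checkpoints saved as plain state dicts (ESPnet's own routine also handles non-float buffers):

    import torch

    def average_checkpoints(paths: list) -> dict:
        """Parameter-wise mean over n-best checkpoints."""
        avg = None
        for p in paths:
            state = torch.load(p, map_location="cpu")
            if avg is None:
                avg = {k: v.clone().float() for k, v in state.items()}
            else:
                for k in avg:
                    avg[k] += state[k].float()
        return {k: v / len(paths) for k, v in avg.items()}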
+[gpub001:0/64] 2023-07-04 09:03:16,753 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/64] 2023-07-04 09:03:21,037 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.4", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.4", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.4", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.4", "type": "text"} + preprocess: ) +[gpub001:0/64] 2023-07-04 09:03:21,038 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.4, +[gpub001:0/64] 2023-07-04 09:03:21,282 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129 +[gpub001:0/64] 2023-07-04 09:11:53,137 (trainer:732) INFO: 11epoch:train:1-100batch: iter_time=3.940, forward_time=0.203, loss_ctc=69.654, loss_att=54.166, acc=0.663, loss=58.813, backward_time=1.045, grad_norm=79.216, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.187, optim0_lr0=1.178e-04, train_time=10.802 +[gpub001:0/64] 2023-07-04 09:14:10,915 (trainer:732) INFO: 11epoch:train:101-200batch: iter_time=1.300e-04, forward_time=0.146, loss_ctc=88.708, loss_att=62.409, acc=0.670, loss=70.299, backward_time=1.030, grad_norm=109.268, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.184, optim0_lr0=1.178e-04, train_time=2.755 +[gpub001:0/64] 2023-07-04 09:16:26,411 (trainer:732) INFO: 11epoch:train:201-300batch: iter_time=1.304e-04, forward_time=0.144, loss_ctc=75.382, loss_att=62.579, acc=0.655, loss=66.420, backward_time=1.027, grad_norm=83.304, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.177e-04, train_time=2.710 +[gpub001:0/64] 2023-07-04 09:18:42,130 (trainer:732) INFO: 11epoch:train:301-400batch: iter_time=1.207e-04, forward_time=0.144, loss_ctc=78.822, loss_att=59.346, acc=0.668, loss=65.189, backward_time=1.026, grad_norm=100.773, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.176e-04, train_time=2.714 +[gpub001:0/64] 2023-07-04 09:21:02,338 (trainer:732) INFO: 11epoch:train:401-500batch: iter_time=1.325e-04, forward_time=0.145, loss_ctc=76.393, loss_att=64.269, acc=0.659, loss=67.906, backward_time=1.036, grad_norm=116.385, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.184, optim0_lr0=1.176e-04, train_time=2.804 +[gpub001:0/64] 2023-07-04 09:23:33,454 (trainer:732) INFO: 11epoch:train:501-600batch: iter_time=1.256e-04, forward_time=0.144, loss_ctc=76.583, loss_att=58.803, acc=0.655, loss=64.137, backward_time=1.050, grad_norm=86.037, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.175e-04, train_time=3.022 +[gpub001:0/64] 2023-07-04 09:26:08,811 (trainer:732) INFO: 11epoch:train:601-700batch: iter_time=1.229e-04, forward_time=0.146, loss_ctc=91.392, loss_att=65.716, acc=0.662, loss=73.419, backward_time=1.075, grad_norm=105.260, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.174e-04, train_time=3.107 +[gpub001:0/64] 2023-07-04 09:28:29,823 (trainer:732) INFO: 11epoch:train:701-800batch: iter_time=1.309e-04, forward_time=0.144, loss_ctc=80.913, loss_att=63.348, acc=0.638, loss=68.618, backward_time=1.034, grad_norm=106.285, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.174e-04, 
train_time=2.820 +[gpub001:0/64] 2023-07-04 09:31:05,097 (trainer:732) INFO: 11epoch:train:801-900batch: iter_time=3.980e-04, forward_time=0.269, loss_ctc=82.733, loss_att=69.706, acc=0.657, loss=73.614, backward_time=1.058, grad_norm=122.936, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.189, optim0_lr0=1.173e-04, train_time=3.105 +[gpub001:0/64] 2023-07-04 09:33:40,369 (trainer:732) INFO: 11epoch:train:901-1000batch: iter_time=1.140e-04, forward_time=0.148, loss_ctc=67.176, loss_att=55.082, acc=0.663, loss=58.710, backward_time=1.066, grad_norm=79.860, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.183, optim0_lr0=1.172e-04, train_time=3.105 +[gpub001:0/64] 2023-07-04 09:33:57,353 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpub001:0/64] 2023-07-04 09:34:19,685 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/64] 2023-07-04 09:34:23,885 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.2", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.2", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.2", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.2", "type": "text"} + preprocess: ) +[gpub001:0/64] 2023-07-04 09:34:23,885 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=45593, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.2, +[gpub001:0/64] 2023-07-04 09:34:23,892 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=45593, mean=128.0, min=128, max=129 +Traceback (most recent call last): + File "<string>", line 1, in <module> + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main + exitcode = _main(fd, parent_sentinel) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/spawn.py", line 126, in _main + self = reduction.pickle.load(from_parent) +_pickle.UnpicklingError: pickle data was truncated +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in <module> + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 140, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with signal SIGKILL +Traceback (most recent call last): + File "<string>", line 1, in <module> + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main + exitcode = _main(fd, parent_sentinel) + File
"/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/spawn.py", line 126, in _main + self = reduction.pickle.load(from_parent) +_pickle.UnpicklingError: pickle data was truncated +Traceback (most recent call last): + File "", line 1, in + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main + exitcode = _main(fd, parent_sentinel) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/spawn.py", line 126, in _main + self = reduction.pickle.load(from_parent) +_pickle.UnpicklingError: pickle data was truncated +Traceback (most recent call last): + File "", line 1, in + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main + exitcode = _main(fd, parent_sentinel) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/spawn.py", line 126, in _main + self = reduction.pickle.load(from_parent) +_pickle.UnpicklingError: pickle data was truncated +slurmstepd: error: Detected 1 oom-kill event(s) in StepId=2121665.0. Some of your processes may have been killed by the cgroup out-of-memory handler. +srun: error: gpub001: task 0: Out Of Memory +gpub022:3399535:3399624 [1] NCCL INFO [Service thread] Connection closed by localRank 1 +gpub022:3399536:3399623 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub022:3399537:3399622 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +gpub022:3399534:3399625 [0] NCCL INFO [Service thread] Connection closed by localRank 0 +gpub022:3399536:3399536 [2] NCCL INFO comm 0x93f2210 rank 18 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +[W ProcessGroupNCCL.cpp:948] [Rank 43] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 42] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 50] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 49] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 40] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. 
This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 41] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 48] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 51] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +gpub076:3343846:3343926 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +gpub076:3343843:3343928 [0] NCCL INFO [Service thread] Connection closed by localRank 0 +gpub076:3343845:3343927 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub066:1432047:1432134 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub066:1432046:1432136 [1] NCCL INFO [Service thread] Connection closed by localRank 1 +gpub066:1432048:1432137 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +gpub022:3399537:3399537 [3] NCCL INFO comm 0x50214710 rank 19 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +gpub022:3399535:3399535 [1] NCCL INFO comm 0x4fa312f0 rank 17 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +gpub022:3399534:3399534 [0] NCCL INFO comm 0x50711f50 rank 16 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +gpub066:1432048:1432069 [0] NCCL INFO comm 0x51126a70 rank 43 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +gpub066:1432047:1432070 [0] NCCL INFO comm 0x9ed0150 rank 42 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +gpub076:3343845:3343867 [0] NCCL INFO comm 0x4fe2ad90 rank 50 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +gpub076:3343846:3343866 [0] NCCL INFO comm 0x50888c10 rank 51 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +gpub066:1432046:1432068 [0] NCCL INFO comm 0x4fabed20 rank 41 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +gpub076:3343844:3343868 [0] NCCL INFO comm 0xb838ee00 rank 49 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +gpub076:3343843:3343869 [0] NCCL INFO comm 0x508de3f0 rank 48 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +gpub066:1432045:1432071 [0] NCCL INFO comm 0x50653520 rank 40 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File 
"/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 51] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: [Rank 19] Caught collective operation timeout: WorkNCCL(SeqNum=2076236, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800147 milliseconds before timing out. +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 48] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 50] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 49] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-3: +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: [Rank 18] Caught collective operation timeout: WorkNCCL(SeqNum=2076236, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800146 milliseconds before timing out. +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: [Rank 16] Caught collective operation timeout: WorkNCCL(SeqNum=2076236, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800157 milliseconds before timing out. 
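Editor's note: the errors above are one cascade, not independent crashes. The cgroup OOM handler SIGKILLed task 0 on gpub001 (the slurmstepd/srun errors), its freshly spawned dataloader children died mid-handshake (_pickle.UnpicklingError: pickle data was truncated), and every surviving rank was left blocked in the torch.distributed.all_reduce at trainer.py:516 until the NCCL collective timeout of Timeout(ms)=1800000, PyTorch's default 30 minutes, expired; the NCCLABORTEDCOMM keys in the store then propagated the abort to the remaining communicators. The timeout is set when the process group is created; a hedged sketch of shortening it so a dead peer fails fast instead of hanging silently (illustrative only, not how this recipe configures it):

    import datetime
    import torch.distributed as dist

    # Assumes the usual env:// rendezvous variables (MASTER_ADDR, MASTER_PORT,
    # RANK, WORLD_SIZE) are already set, as under a SLURM launcher.
    dist.init_process_group(
        backend="nccl",
        timeout=datetime.timedelta(minutes=10),  # default is 30 min (1800000 ms)
    )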
+Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: [Rank 17] Caught collective operation timeout: WorkNCCL(SeqNum=2076236, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800147 milliseconds before timing out. +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 42] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 43] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 41] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 40] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 17. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 27] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 46] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 44] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 25] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 47] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 59] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. 
This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 24] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 58] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 57] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 26] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 45] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +gpub031:1878314:1878398 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +gpub031:1878313:1878397 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub079:2616804:2616888 [1] NCCL INFO [Service thread] Connection closed by localRank 1 +gpub079:2616805:2616890 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub067:1390514:1390593 [1] NCCL INFO [Service thread] Connection closed by localRank 1 +gpub067:1390516:1390596 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +gpub067:1390513:1390595 [0] NCCL INFO [Service thread] Connection closed by localRank 0 +[W ProcessGroupNCCL.cpp:948] [Rank 56] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators
+gpub067:1390516:1390538 [0] NCCL INFO comm 0x509fc1c0 rank 47 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub079:2616806:2616829 [0] NCCL INFO comm 0x89762f0 rank 59 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub031:1878312:1878336 [0] NCCL INFO comm 0x509faf60 rank 25 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub079:2616805:2616827 [0] NCCL INFO comm 0x8b2c9c20 rank 58 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub067:1390514:1390537 [0] NCCL INFO comm 0xa70b75d0 rank 45 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub031:1878313:1878334 [0] NCCL INFO comm 0xa54f400 rank 26 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub067:1390513:1390536 [0] NCCL INFO comm 0x4ef73970 rank 44 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+gpub079:2616804:2616830 [0] NCCL INFO comm 0x9014adc0 rank 57 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub067:1390515:1390535 [0] NCCL INFO comm 0x5030f0d0 rank 46 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub031:1878314:1878335 [0] NCCL INFO comm 0x511daaa0 rank 27 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub079:2616803:2616828 [0] NCCL INFO comm 0xa9779a50 rank 56 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+gpub031:1878311:1878337 [0] NCCL INFO comm 0xba515710 rank 24 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 27] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 59] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 58] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 46] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 56] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 57] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 44] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 25] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 26] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 45] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 24] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 47] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 16. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 14] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 15] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 13] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 12] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub016:1380823:1380905 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+gpub016:1380823:1380846 [0] NCCL INFO comm 0x517fee10 rank 14 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+[W ProcessGroupNCCL.cpp:948] [Rank 54] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 55] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 52] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 53] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub016:1380824:1380845 [0] NCCL INFO comm 0x8d241cc0 rank 15 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub016:1380822:1380843 [0] NCCL INFO comm 0x9b8bb7a0 rank 13 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub077:252894:252972 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+gpub077:252895:252971 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+gpub077:252894:252916 [0] NCCL INFO comm 0xc19a4b40 rank 54 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub077:252893:252970 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub077:252893:252914 [0] NCCL INFO comm 0x509e6280 rank 53 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub077:252895:252913 [0] NCCL INFO comm 0x9491900 rank 55 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub077:252892:252969 [0] NCCL INFO [Service thread] Connection closed by localRank 0
+gpub077:252892:252915 [0] NCCL INFO comm 0x97aafd0 rank 52 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+gpub016:1380821:1380844 [0] NCCL INFO comm 0x50896990 rank 12 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 14] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 54] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 53] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 13] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 55] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 15] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 52] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 21] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 23] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 22] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub030:2310660:2310736 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+[W ProcessGroupNCCL.cpp:948] [Rank 20] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub030:2310660:2310681 [0] NCCL INFO comm 0xa84d3a10 rank 23 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 12] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub030:2310658:2310733 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub030:2310659:2310682 [0] NCCL INFO comm 0x8de12f60 rank 22 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub030:2310658:2310680 [0] NCCL INFO comm 0x50672d50 rank 21 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+[W ProcessGroupNCCL.cpp:948] [Rank 38] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 39] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 37] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 36] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub060:1938146:1938228 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+gpub060:1938145:1938226 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+gpub060:1938144:1938225 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub060:1938146:1938170 [0] NCCL INFO comm 0x50addeb0 rank 39 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub060:1938145:1938171 [0] NCCL INFO comm 0xb591e2d0 rank 38 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub060:1938144:1938172 [0] NCCL INFO comm 0x4f3bc650 rank 37 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub030:2310657:2310679 [0] NCCL INFO comm 0x50d929d0 rank 20 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+[W ProcessGroupNCCL.cpp:948] [Rank 62] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 63] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub096:1440104:1440184 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+gpub096:1440103:1440185 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+[W ProcessGroupNCCL.cpp:948] [Rank 60] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 61] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub096:1440102:1440186 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub096:1440103:1440125 [0] NCCL INFO comm 0x91c6060 rank 62 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub096:1440104:1440126 [0] NCCL INFO comm 0x9f265ce0 rank 63 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub096:1440102:1440128 [0] NCCL INFO comm 0x50d96930 rank 61 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub032:3246893:3246982 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub032:3246893:3246893 [1] NCCL INFO comm 0x9a6ad00 rank 29 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 23] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 21] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 38] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 22] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub060:1938143:1938169 [0] NCCL INFO comm 0x50561020 rank 36 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 20] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub096:1440101:1440127 [0] NCCL INFO comm 0x50b020d0 rank 60 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 39] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 37] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 35] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 33] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+[W ProcessGroupNCCL.cpp:948] [Rank 32] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 62] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 11] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub059:1894384:1894465 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub059:1894386:1894467 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+[W ProcessGroupNCCL.cpp:948] [Rank 9] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 8] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub059:1894386:1894405 [0] NCCL INFO comm 0x9cf1390 rank 35 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+[W ProcessGroupNCCL.cpp:948] [Rank 10] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 63] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 34] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub059:1894385:1894466 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: [Rank 29] Caught collective operation timeout: WorkNCCL(SeqNum=2076236, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800044 milliseconds before timing out.
+gpub059:1894385:1894403 [0] NCCL INFO comm 0x50af3510 rank 34 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub015:828881:828961 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+gpub015:828878:828959 [0] NCCL INFO [Service thread] Connection closed by localRank 0
+gpub015:828879:828960 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub015:828880:828958 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+[W ProcessGroupNCCL.cpp:948] [Rank 4] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 7] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 5] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub015:828879:828899 [0] NCCL INFO comm 0x8ad4b90 rank 9 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub059:1894384:1894406 [0] NCCL INFO comm 0xb7b49460 rank 33 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 61] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 6] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub015:828880:828901 [0] NCCL INFO comm 0x9e67ed0 rank 10 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub015:828881:828898 [0] NCCL INFO comm 0xb64dad10 rank 11 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub002:1756560:1756644 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub002:1756561:1756645 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+gpub002:1756562:1756643 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+gpub015:828878:828900 [0] NCCL INFO comm 0x8fc63100 rank 8 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+gpub002:1756560:1756584 [0] NCCL INFO comm 0x17829840 rank 5 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub002:1756561:1756582 [0] NCCL INFO comm 0x51ad54d0 rank 6 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub002:1756562:1756583 [0] NCCL INFO comm 0x9ca8ab90 rank 7 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub002:1756559:1756646 [0] NCCL INFO [Service thread] Connection closed by localRank 0
+gpub002:1756559:1756581 [0] NCCL INFO comm 0x51930090 rank 4 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 36] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 60] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 18. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub059:1894383:1894404 [0] NCCL INFO comm 0x510467d0 rank 32 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 35] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-2:
+[W ProcessGroupNCCL.cpp:948] [Rank 30] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 9] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 31] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 28] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c17650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state..
Aborting appropriate communicators +gpub032:3246895:3246981 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +gpub032:3246895:3246917 [0] NCCL INFO comm 0x1b5e5670 rank 31 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +gpub032:3246894:3246916 [0] NCCL INFO comm 0x9ddee7e0 rank 30 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 5] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 10] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 7] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 33] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 6] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 8] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 11] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 4] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch + retval = model(**batch) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl + return forward_call(*input, **kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward + self._sync_buffers() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers + self._sync_module_buffers(authoritative_rank) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers + self._default_broadcast_coalesced(authoritative_rank=authoritative_rank) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced + self._distributed_broadcast_coalesced( + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced + dist._broadcast_coalesced( +RuntimeError: NCCL communicator was aborted on rank 34. Original reason for failure was: [Rank 34] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 32] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +gpub032:3246892:3246918 [0] NCCL INFO comm 0x4ff3dba0 rank 28 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch + retval = model(**batch) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl + return forward_call(*input, **kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward + self._sync_buffers() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers + self._sync_module_buffers(authoritative_rank) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers + self._default_broadcast_coalesced(authoritative_rank=authoritative_rank) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced + self._distributed_broadcast_coalesced( + File 
"/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced + dist._broadcast_coalesced( +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 31] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch + retval = model(**batch) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl + return forward_call(*input, **kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward + self._sync_buffers() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers + self._sync_module_buffers(authoritative_rank) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers + self._default_broadcast_coalesced(authoritative_rank=authoritative_rank) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced + self._distributed_broadcast_coalesced( + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced + dist._broadcast_coalesced( +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 30] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 28] Found key in store: NCCLABORTEDCOMM:20c6b5ac1c1765000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 29. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 3 terminated with exit code 1 +Traceback (most recent call last): + File 
"/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 +srun: error: gpub067: task 11: Exited with exit code 1 +srun: error: gpub022: task 4: Exited with exit code 1 +srun: error: gpub031: task 6: Exited with exit code 1 +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 1 terminated with exit code 1 +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 1 terminated with exit code 1 +srun: error: gpub096: task 15: Exited with exit code 1 +srun: error: gpub032: task 7: Exited with exit code 1 +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File 
"/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 3 terminated with exit code 1 +srun: error: gpub066: task 10: Exited with exit code 1 +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 1 terminated with exit code 1 +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File 
"/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with exit code 1 + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + main() + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + main() + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + S2TTask.main(cmd=cmd) + S2TTask.main(cmd=cmd) + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + while not ProcessContext(processes, error_queues).join(): + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + File 
"/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with exit code 1 + raise ProcessExitedException( + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 +torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 + raise ProcessExitedException( + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 +torch.multiprocessing.spawn.ProcessExitedException: process 3 terminated with exit code 1 +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + return _run_code(code, main_globals, None, + raise ProcessExitedException( + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code +torch.multiprocessing.spawn.ProcessExitedException: process 1 terminated with exit code 1 + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 3 terminated with exit code 1 +srun: error: gpub060: task 9: Exited with exit code 1 +srun: error: gpub076: task 12: Exited with exit code 1 +srun: error: gpub079: task 14: Exited with exit code 1 +srun: error: gpub077: task 13: Exited with exit code 1 +srun: error: gpub059: task 8: Exited with exit code 1 +srun: error: gpub030: task 5: Exited with exit code 1 +srun: error: gpub002: task 1: Exited with exit code 1 +srun: error: gpub016: task 3: Exited with exit code 1 +srun: error: gpub015: task 2: Exited with exit code 1 +# Accounting: 
begin_time=1688441050 +# Accounting: end_time=1688483245 +# Accounting: time=42195 threads=1 +# Finished at Tue Jul 4 10:07:25 CDT 2023 with status 1