diff --git "a/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.8.log" "b/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.8.log" new file mode 100644--- /dev/null +++ "b/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.8.log" @@ -0,0 +1,4904 @@ +# Running on gpub015.delta.ncsa.illinois.edu +# Started at Tue Jul 4 13:05:28 CDT 2023 +# SLURMD_NODENAME=gpub015 +# SLURM_CLUSTER_NAME=delta +# SLURM_CONF=/var/spool/slurmd/conf-cache/slurm.conf +# SLURM_CPUS_ON_NODE=64 +# SLURM_CPUS_PER_TASK=64 +# SLURM_EXPORT_ENV=PATH +# SLURM_GET_USER_ENV=1 +# SLURM_GPUS_ON_NODE=4 +# SLURM_GTIDS=0 +# SLURM_JOBID=2127681 +# SLURM_JOB_ACCOUNT=bbjs-delta-gpu +# SLURM_JOB_CPUS_PER_NODE='64(x16)' +# SLURM_JOB_GID=202 +# SLURM_JOB_GPUS=0,1,2,3 +# SLURM_JOB_ID=2127681 +# SLURM_JOB_NAME=exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.log +# SLURM_JOB_NODELIST='gpub[015,026,031-032,036-037,049-053,078-082]' +# SLURM_JOB_NUM_NODES=16 +# SLURM_JOB_PARTITION=gpuA40x4 +# SLURM_JOB_QOS=bbjs-delta-gpu +# SLURM_JOB_UID=68077 +# SLURM_JOB_USER=peng6 +# SLURM_LOCALID=0 +# SLURM_MEM_PER_NODE=240000 +# SLURM_NNODES=16 +# SLURM_NODEID=0 +# SLURM_NODELIST='gpub[015,026,031-032,036-037,049-053,078-082]' +# SLURM_NODE_ALIASES='(null)' +# SLURM_OPEN_MODE=a +# SLURM_PRIO_PROCESS=0 +# SLURM_PROCID=0 +# SLURM_SUBMIT_DIR=/scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1 +# SLURM_SUBMIT_HOST=dt-login02.delta.internal.ncsa.edu +# SLURM_TASKS_PER_NODE='1(x16)' +# SLURM_TASK_PID=879691 +# SLURM_TOPOLOGY_ADDR=ss00.ss09.gpub015 +# SLURM_TOPOLOGY_ADDR_PATTERN=switch.switch.node +# SLURM_WORKING_CLUSTER=delta:dt-sched:6817:9728:109 +# srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text 
--valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_c90a07cb-a80d-424b-bc3c-f044f91f1dea +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file 
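Each --train_data_path_and_name_and_type / --valid_data_path_and_name_and_type value in the command above is a comma-separated path,name,type triplet: the file at path is exposed to the model under name and read as type (e.g. dump/raw/dev/wav.scp,speech,kaldi_ark feeds Kaldi-ark audio in as "speech"). A minimal sketch of that decomposition; the helper name is illustrative, not ESPnet's actual parser:

# Illustrative only: ESPnet parses these triplets internally; this helper
# just shows how one "path,name,type" argument splits into its three fields.
def split_path_name_type(arg: str):
    path, name, dtype = arg.split(",")
    return path, name, dtype

print(split_path_name_type("dump/raw/dev/wav.scp,speech,kaldi_ark"))
# -> ('dump/raw/dev/wav.scp', 'speech', 'kaldi_ark')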
+[gpub015:0/64] 2023-07-04 13:08:42,453 (distributed_c10d:319) INFO: Added key: store_based_barrier_key:1 to store for rank: 0
+[gpub015:0/64] 2023-07-04 13:08:43,721 (distributed_c10d:353) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 64 nodes.
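The two barrier messages above are torch.distributed's store-based rendezvous completing across all 16 nodes x 4 GPUs = 64 ranks. A minimal sketch of the file-based initialization that --dist_init_method file://... selects; this is a simplified stand-in for ESPnet's launcher, and the init file path is a placeholder for the .dist_init_<uuid> file logged above:

# Minimal sketch, not ESPnet's launcher: 64 ranks rendezvous through a file
# on the shared filesystem, then pass the store-based barrier seen in the log.
import torch.distributed as dist

def init_distributed(rank: int, world_size: int = 64) -> None:
    dist.init_process_group(
        backend="nccl",  # GPU training backend
        init_method="file:///shared/exp_dir/.dist_init",  # placeholder path
        rank=rank,
        world_size=world_size,
    )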
+[gpub015:0/64] 2023-07-04 13:08:43,750 (s2t:483) INFO: Vocabulary size: 50002
+[gpub015:0/64] 2023-07-04 13:08:58,245 (abs_task:1201) INFO: pytorch.version=1.13.1, cuda.available=True, cudnn.version=8500, cudnn.benchmark=False, cudnn.deterministic=True
+[gpub015:0/64] 2023-07-04 13:08:58,254 (abs_task:1202) INFO: Model structure:
+ESPnetS2TModel(
+  (frontend): DefaultFrontend(
+    (stft): Stft(n_fft=512, win_length=400, hop_length=160, center=True, normalized=False, onesided=True)
+    (frontend): Frontend()
+    (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False)
+  )
+  (specaug): SpecAug(
+    (freq_mask): MaskAlongAxis(mask_width_range=[0, 27], num_mask=2, axis=freq)
+    (time_mask): MaskAlongAxisVariableMaxWidth(mask_width_ratio_range=[0.0, 0.05], num_mask=10, axis=time)
+  )
+  (normalize): GlobalMVN(stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz, norm_means=True, norm_vars=True)
+  (encoder): TransformerEncoder(
+    (embed): Conv2dSubsampling(
+      (conv): Sequential(
+        (0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2))
+        (1): ReLU()
+        (2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2))
+        (3): ReLU()
+      )
+      (out): Sequential(
+        (0): Linear(in_features=19456, out_features=1024, bias=True)
+        (1): PositionalEncoding(
+          (dropout): Dropout(p=0.1, inplace=False)
+        )
+      )
+    )
+    (encoders): MultiSequential(
+      (0-23): 24 x EncoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+    )
+    (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+  )
+  (decoder): TransformerDecoder(
+    (embed): Sequential(
+      (0): Embedding(50002, 1024)
+      (1): PositionalEncoding(
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+    )
+    (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+    (output_layer): Linear(in_features=1024, out_features=50002, bias=True)
+    (decoders): MultiSequential(
+      (0-2): 3 x DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (3): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), 
eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): 
LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + 
(w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (12): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (13): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (14): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): 
Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (15): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (16): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (17): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): 
Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (18): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (19): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (20): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): 
Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (21): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (22): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (23): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): 
Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) + (ctc): CTC( + (ctc_lo): Linear(in_features=1024, out_features=50002, bias=True) + (ctc_loss): CTCLoss() + ) +) + +Model summary: + Class Name: ESPnetS2TModel + Total Number of model parameters: 888.51 M + Number of trainable parameters: 888.51 M (100.0%) + Size: 3.55 GB + Type: torch.float32 +[gpub015:0/64] 2023-07-04 13:08:58,254 (abs_task:1205) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.9, 0.98] + capturable: False + eps: 1e-06 + foreach: None + initial_lr: 0.00025 + lr: 2.5e-08 + maximize: False + weight_decay: 0.0 +) +[gpub015:0/64] 2023-07-04 13:08:58,254 (abs_task:1206) INFO: Scheduler: WarmupLR(warmup_steps=10000) +[gpub015:0/64] 2023-07-04 13:08:58,256 (abs_task:1215) INFO: Saving the configuration in exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/config.yaml +[gpub015:0/64] 2023-07-04 13:08:58,944 (abs_task:1272) INFO: Loading pretrained params from /scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v2/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e18_d18_lr5e-4_warmup20k_raw_bpe50000/valid.acc.ave.pth +[gpub015:0/64] 2023-07-04 13:09:08,464 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 13:09:08,613 (abs_task:1570) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev/wav.scp", "type": "kaldi_ark"} + text_prev: {"path": "dump/raw/dev/text.prev", "type": "text"} + text_ctc: {"path": "dump/raw/dev/text.ctc", "type": "text"} + text: {"path": "dump/raw/dev/text", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 13:09:08,613 (abs_task:1571) INFO: [valid] Batch sampler: UnsortedBatchSampler(N-batch=1012, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/valid/speech_shape, +[gpub015:0/64] 2023-07-04 13:09:08,619 (abs_task:1572) INFO: [valid] mini-batch sizes summary: N-batch=1012, mean=128.1, min=128, max=129 +[gpub015:0/64] 2023-07-04 13:09:09,108 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 13:09:09,424 (abs_task:1570) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev/wav.scp", "type": "kaldi_ark"} + text_prev: {"path": "dump/raw/dev/text.prev", "type": "text"} + text_ctc: {"path": "dump/raw/dev/text.ctc", "type": "text"} + text: {"path": "dump/raw/dev/text", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 13:09:09,424 (abs_task:1571) INFO: [plot_att] Batch 
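Two numbers above invite misreading, so here is a small cross-check (plain Python with illustrative names; it assumes ESPnet's WarmupLR implements the usual Noam-style rule rather than quoting its source): the module shapes visible in the repr account for about 859 M of the 888.51 M parameters, with the remainder in the conv2d frontend printed earlier in this log, and the lr: 2.5e-08 in Parameter Group 0 is just the warm-up schedule evaluated at step 1 of 10000, not a misconfigured learning rate.

# Sketch: reproduce the "Model summary" and optimizer numbers above.
# Pure arithmetic over the shapes visible in the repr; the conv2d
# frontend printed earlier in the log is not recomputed here.
D, FF, V, LAYERS = 1024, 4096, 50002, 24

def linear(i, o):            # weight + bias
    return i * o + o

mha = 4 * linear(D, D)                   # linear_q / linear_k / linear_v / linear_out
ffn = linear(D, FF) + linear(FF, D)      # w_1 + w_2
norm = 2 * D                             # LayerNorm weight + bias

enc_layer = mha + ffn + 2 * norm         # self_attn, FFN, norm1-2
dec_layer = 2 * mha + ffn + 3 * norm     # self_attn, src_attn, FFN, norm1-3
visible = (LAYERS * enc_layer + norm     # encoder + after_norm
           + LAYERS * dec_layer + norm   # decoder + after_norm
           + V * D                       # Embedding(50002, 1024)
           + linear(D, V)                # decoder output_layer
           + linear(D, V))               # ctc_lo
print(f"{visible / 1e6:.2f} M")          # 859.14 M of the 888.51 M total

# WarmupLR with initial_lr=2.5e-4 and warmup_steps=10000, assuming the
# standard Noam-style rule: starts at 2.5e-8, peaks at 2.5e-4.
def warmup_lr(step, base=2.5e-4, warmup=10000):
    return base * warmup ** 0.5 * min(step ** -0.5, step * warmup ** -1.5)

print(warmup_lr(1), warmup_lr(10000))    # 2.5e-08, 2.5e-04 (up to float rounding)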
+[gpub015:0/64] 2023-07-04 13:08:58,256 (abs_task:1215) INFO: Saving the configuration in exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/config.yaml
+[gpub015:0/64] 2023-07-04 13:08:58,944 (abs_task:1272) INFO: Loading pretrained params from /scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v2/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e18_d18_lr5e-4_warmup20k_raw_bpe50000/valid.acc.ave.pth
+[gpub015:0/64] 2023-07-04 13:09:08,464 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub015:0/64] 2023-07-04 13:09:08,613 (abs_task:1570) INFO: [valid] dataset:
+ESPnetDataset(
+ speech: {"path": "dump/raw/dev/wav.scp", "type": "kaldi_ark"}
+ text_prev: {"path": "dump/raw/dev/text.prev", "type": "text"}
+ text_ctc: {"path": "dump/raw/dev/text.ctc", "type": "text"}
+ text: {"path": "dump/raw/dev/text", "type": "text"}
+ preprocess: )
+[gpub015:0/64] 2023-07-04 13:09:08,613 (abs_task:1571) INFO: [valid] Batch sampler: UnsortedBatchSampler(N-batch=1012, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/valid/speech_shape,
+[gpub015:0/64] 2023-07-04 13:09:08,619 (abs_task:1572) INFO: [valid] mini-batch sizes summary: N-batch=1012, mean=128.1, min=128, max=129
+[gpub015:0/64] 2023-07-04 13:09:09,108 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub015:0/64] 2023-07-04 13:09:09,424 (abs_task:1570) INFO: [plot_att] dataset:
+ESPnetDataset(
+ speech: {"path": "dump/raw/dev/wav.scp", "type": "kaldi_ark"}
+ text_prev: {"path": "dump/raw/dev/text.prev", "type": "text"}
+ text_ctc: {"path": "dump/raw/dev/text.ctc", "type": "text"}
+ text: {"path": "dump/raw/dev/text", "type": "text"}
+ preprocess: )
+[gpub015:0/64] 2023-07-04 13:09:09,424 (abs_task:1571) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=129591, batch_size=1, key_file=exp/s2t_stats_raw_bpe50000/valid/speech_shape,
+[gpub015:0/64] 2023-07-04 13:09:09,424 (abs_task:1572) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1
+[gpub015:0/64] 2023-07-04 13:09:37,908 (trainer:159) INFO: The training was resumed using exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/checkpoint.pth
+gpub015:879780:879780 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.115<0>
+gpub015:879780:879780 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub015:879780:879780 [0] NCCL INFO cudaDriverVersion 12010
+NCCL version 2.14.3+cuda11.7
+[gpub015:0/64] 2023-07-04 13:09:43,209 (trainer:284) INFO: 11/100epoch started
+[gpub015:0/64] 2023-07-04 13:09:43,268 (multiple_iter_factory:32) INFO: Building 0th iter-factory...
+[gpub015:0/64] 2023-07-04 13:10:00,590 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub015:0/64] 2023-07-04 13:10:03,964 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+ speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.1", "type": "kaldi_ark"}
+ text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.1", "type": "text"}
+ text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.1", "type": "text"}
+ text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.1", "type": "text"}
+ preprocess: )
+[gpub015:0/64] 2023-07-04 13:10:03,964 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.1,
+[gpub015:0/64] 2023-07-04 13:10:03,970 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129
+gpub032:3289606:3289606 [3] NCCL INFO cudaDriverVersion 12010
+gpub032:3289606:3289606 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.132<0>
+gpub032:3289606:3289606 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub032:3289606:3289687 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.132<0>
+gpub032:3289606:3289687 [3] NCCL INFO Using network IB
+gpub032:3289606:3289687 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub032:3289606:3289687 [3] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] -1/-1/-1->15->14
+gpub032:3289606:3289687 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 16[7000] [send] via NET/IB/0
+gpub032:3289606:3289687 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 16[7000] [send] via NET/IB/0
+gpub032:3289606:3289687 [3] NCCL INFO Connected all rings
+gpub032:3289606:3289687 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 14[85000] via P2P/IPC
+gpub032:3289606:3289687 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 14[85000] via P2P/IPC
+gpub032:3289606:3289687 [3] NCCL INFO Connected all trees
+gpub032:3289606:3289687 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub032:3289606:3289687 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub032:3289606:3289687 [3] NCCL INFO comm 0x501cec20 rank 15 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub015:879783:879783 [3] NCCL INFO cudaDriverVersion 12010
+gpub015:879783:879783 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.115<0>
+gpub015:879783:879783 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub015:879783:879851 [3] NCCL INFO NET/IB : Using
[0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.115<0> +gpub015:879783:879851 [3] NCCL INFO Using network IB +gpub015:879783:879851 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub015:879783:879851 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +gpub015:879783:879851 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 4[7000] [send] via NET/IB/0 +gpub015:879783:879851 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 4[7000] [send] via NET/IB/0 +gpub015:879783:879851 [3] NCCL INFO Connected all rings +gpub015:879783:879851 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 2[85000] via P2P/IPC +gpub015:879783:879851 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 2[85000] via P2P/IPC +gpub015:879783:879851 [3] NCCL INFO Connected all trees +gpub051:2913626:2913626 [3] NCCL INFO cudaDriverVersion 12010 +gpub051:2913626:2913626 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.151<0> +gpub051:2913626:2913626 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub051:2913626:2913705 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.151<0> +gpub051:2913626:2913705 [3] NCCL INFO Using network IB +gpub051:2913626:2913705 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub051:2913626:2913705 [3] NCCL INFO Trees [0] -1/-1/-1->35->34 [1] -1/-1/-1->35->34 +gpub051:2913626:2913705 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [send] via NET/IB/0 +gpub051:2913626:2913705 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [send] via NET/IB/0 +gpub051:2913626:2913705 [3] NCCL INFO Connected all rings +gpub051:2913626:2913705 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 34[85000] via P2P/IPC +gpub051:2913626:2913705 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 34[85000] via P2P/IPC +gpub015:879783:879851 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub015:879783:879851 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub015:879783:879851 [3] NCCL INFO comm 0x5071eb50 rank 3 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub051:2913626:2913705 [3] NCCL INFO Connected all trees +gpub051:2913626:2913705 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub051:2913626:2913705 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub051:2913626:2913705 [3] NCCL INFO comm 0x9e42a10 rank 35 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub051:2913625:2913625 [2] NCCL INFO cudaDriverVersion 12010 +gpub051:2913625:2913625 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.151<0> +gpub051:2913625:2913625 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub051:2913625:2913708 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.151<0> +gpub051:2913625:2913708 [2] NCCL INFO Using network IB +gpub051:2913625:2913708 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub051:2913625:2913708 [2] NCCL INFO Trees [0] 35/-1/-1->34->33 [1] 35/-1/-1->34->33 +gpub051:2913625:2913708 [2] NCCL INFO Channel 00/0 : 34[85000] -> 35[c7000] via P2P/IPC +gpub051:2913625:2913708 [2] NCCL INFO Channel 01/0 : 34[85000] -> 35[c7000] via P2P/IPC +gpub051:2913625:2913708 [2] NCCL INFO Connected all rings +gpub051:2913625:2913708 [2] NCCL INFO Channel 00/0 : 34[85000] -> 33[46000] via P2P/IPC +gpub051:2913625:2913708 [2] NCCL INFO Channel 01/0 : 34[85000] -> 33[46000] via P2P/IPC +gpub051:2913625:2913708 [2] NCCL INFO Connected all trees +gpub051:2913625:2913708 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub051:2913625:2913708 [2] NCCL INFO 2 
coll channels, 2 p2p channels, 2 p2p channels per peer +gpub051:2913625:2913708 [2] NCCL INFO comm 0xb9b5ccd0 rank 34 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpub080:4113204:4113204 [1] NCCL INFO cudaDriverVersion 12010 +gpub080:4113204:4113204 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.180<0> +gpub080:4113204:4113204 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub080:4113204:4113287 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.180<0> +gpub080:4113204:4113287 [1] NCCL INFO Using network IB +gpub080:4113204:4113287 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub080:4113204:4113287 [1] NCCL INFO Trees [0] 54/-1/-1->53->52 [1] 54/56/-1->53->52 +gpub080:4113204:4113287 [1] NCCL INFO Channel 00/0 : 53[46000] -> 54[85000] via P2P/IPC +gpub080:4113204:4113287 [1] NCCL INFO Channel 01/0 : 53[46000] -> 54[85000] via P2P/IPC +gpub080:4113204:4113287 [1] NCCL INFO Connected all rings +gpub080:4113204:4113287 [1] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [send] via NET/IB/0 +gpub080:4113204:4113287 [1] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [receive] via NET/IB/0 +gpub080:4113204:4113287 [1] NCCL INFO Channel 00/0 : 53[46000] -> 52[7000] via P2P/IPC +gpub080:4113204:4113287 [1] NCCL INFO Channel 01/0 : 53[46000] -> 52[7000] via P2P/IPC +gpub080:4113204:4113287 [1] NCCL INFO Connected all trees +gpub080:4113204:4113287 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub080:4113204:4113287 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub080:4113204:4113287 [1] NCCL INFO comm 0xb71b4bf0 rank 53 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub079:2657933:2657933 [1] NCCL INFO cudaDriverVersion 12010 +gpub079:2657933:2657933 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.179<0> +gpub079:2657933:2657933 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub079:2657933:2658006 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.179<0> +gpub079:2657933:2658006 [1] NCCL INFO Using network IB +gpub079:2657933:2658006 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub079:2657933:2658006 [1] NCCL INFO Trees [0] 50/40/-1->49->48 [1] 50/-1/-1->49->48 +gpub079:2657933:2658006 [1] NCCL INFO Channel 00/0 : 49[46000] -> 50[85000] via P2P/IPC +gpub079:2657933:2658006 [1] NCCL INFO Channel 01/0 : 49[46000] -> 50[85000] via P2P/IPC +gpub079:2657933:2658006 [1] NCCL INFO Connected all rings +gpub079:2657933:2658006 [1] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [receive] via NET/IB/0 +gpub079:2657933:2658006 [1] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [send] via NET/IB/0 +gpub079:2657933:2658006 [1] NCCL INFO Channel 00/0 : 49[46000] -> 48[7000] via P2P/IPC +gpub079:2657933:2658006 [1] NCCL INFO Channel 01/0 : 49[46000] -> 48[7000] via P2P/IPC +gpub079:2657933:2658006 [1] NCCL INFO Connected all trees +gpub079:2657933:2658006 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub079:2657933:2658006 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub079:2657933:2658006 [1] NCCL INFO comm 0x8f776d0 rank 49 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub052:1901667:1901667 [0] NCCL INFO cudaDriverVersion 12010 +gpub052:1901667:1901667 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.152<0> +gpub052:1901667:1901667 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub052:1901667:1901752 [0] NCCL 
INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.152<0> +gpub052:1901667:1901752 [0] NCCL INFO Using network IB +gpub052:1901667:1901752 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub052:1901667:1901752 [0] NCCL INFO Trees [0] 37/-1/-1->36->41 [1] 37/32/-1->36->44 +gpub052:1901667:1901752 [0] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [receive] via NET/IB/0 +gpub052:1901667:1901752 [0] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [receive] via NET/IB/0 +gpub052:1901667:1901752 [0] NCCL INFO Channel 00/0 : 36[7000] -> 37[46000] via P2P/IPC +gpub052:1901667:1901752 [0] NCCL INFO Channel 01/0 : 36[7000] -> 37[46000] via P2P/IPC +gpub052:1901667:1901752 [0] NCCL INFO Connected all rings +gpub052:1901667:1901752 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [receive] via NET/IB/0 +gpub052:1901667:1901752 [0] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [send] via NET/IB/0 +gpub052:1901667:1901752 [0] NCCL INFO Channel 01/0 : 36[7000] -> 44[7000] [send] via NET/IB/0 +gpub052:1901667:1901752 [0] NCCL INFO Channel 01/0 : 44[7000] -> 36[7000] [receive] via NET/IB/0 +gpub052:1901667:1901752 [0] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [receive] via NET/IB/0 +gpub052:1901667:1901752 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [send] via NET/IB/0 +gpub052:1901667:1901752 [0] NCCL INFO Connected all trees +gpub052:1901667:1901752 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub052:1901667:1901752 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub052:1901667:1901752 [0] NCCL INFO comm 0xbc2124a0 rank 36 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub031:1921204:1921204 [0] NCCL INFO cudaDriverVersion 12010 +gpub031:1921204:1921204 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0> +gpub031:1921204:1921204 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub031:1921204:1921285 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0> +gpub031:1921204:1921285 [0] NCCL INFO Using network IB +gpub031:1921204:1921285 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub031:1921204:1921285 [0] NCCL INFO Trees [0] 9/12/-1->8->17 [1] 9/-1/-1->8->5 +gpub031:1921204:1921285 [0] NCCL INFO Channel 00/0 : 7[c7000] -> 8[7000] [receive] via NET/IB/0 +gpub031:1921204:1921285 [0] NCCL INFO Channel 01/0 : 7[c7000] -> 8[7000] [receive] via NET/IB/0 +gpub031:1921204:1921285 [0] NCCL INFO Channel 00/0 : 8[7000] -> 9[46000] via P2P/IPC +gpub031:1921204:1921285 [0] NCCL INFO Channel 01/0 : 8[7000] -> 9[46000] via P2P/IPC +gpub031:1921204:1921285 [0] NCCL INFO Connected all rings +gpub031:1921204:1921285 [0] NCCL INFO Channel 01/0 : 5[46000] -> 8[7000] [receive] via NET/IB/0 +gpub031:1921204:1921285 [0] NCCL INFO Channel 00/0 : 8[7000] -> 12[7000] [send] via NET/IB/0 +gpub031:1921204:1921285 [0] NCCL INFO Channel 00/0 : 8[7000] -> 17[46000] [send] via NET/IB/0 +gpub031:1921204:1921285 [0] NCCL INFO Channel 00/0 : 17[46000] -> 8[7000] [receive] via NET/IB/0 +gpub031:1921204:1921285 [0] NCCL INFO Channel 00/0 : 12[7000] -> 8[7000] [receive] via NET/IB/0 +gpub031:1921204:1921285 [0] NCCL INFO Channel 01/0 : 8[7000] -> 5[46000] [send] via NET/IB/0 +gpub031:1921204:1921285 [0] NCCL INFO Connected all trees +gpub031:1921204:1921285 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub031:1921204:1921285 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub031:1921204:1921285 [0] NCCL INFO comm 0xb63f1750 rank 8 
nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub031:1921207:1921207 [3] NCCL INFO cudaDriverVersion 12010 +gpub031:1921207:1921207 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0> +gpub031:1921207:1921207 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub031:1921207:1921286 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0> +gpub031:1921207:1921286 [3] NCCL INFO Using network IB +gpub031:1921207:1921286 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub031:1921207:1921286 [3] NCCL INFO Trees [0] -1/-1/-1->11->10 [1] -1/-1/-1->11->10 +gpub031:1921207:1921286 [3] NCCL INFO Channel 00/0 : 11[c7000] -> 12[7000] [send] via NET/IB/0 +gpub031:1921207:1921286 [3] NCCL INFO Channel 01/0 : 11[c7000] -> 12[7000] [send] via NET/IB/0 +gpub031:1921207:1921286 [3] NCCL INFO Connected all rings +gpub031:1921207:1921286 [3] NCCL INFO Channel 00/0 : 11[c7000] -> 10[85000] via P2P/IPC +gpub031:1921207:1921286 [3] NCCL INFO Channel 01/0 : 11[c7000] -> 10[85000] via P2P/IPC +gpub031:1921207:1921286 [3] NCCL INFO Connected all trees +gpub031:1921207:1921286 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub031:1921207:1921286 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub031:1921207:1921286 [3] NCCL INFO comm 0x9451c60 rank 11 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub052:1901668:1901668 [1] NCCL INFO cudaDriverVersion 12010 +gpub052:1901668:1901668 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.152<0> +gpub052:1901668:1901668 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub052:1901668:1901749 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.152<0> +gpub052:1901668:1901749 [1] NCCL INFO Using network IB +gpub052:1901668:1901749 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub052:1901668:1901749 [1] NCCL INFO Trees [0] 38/-1/-1->37->36 [1] 38/40/-1->37->36 +gpub052:1901668:1901749 [1] NCCL INFO Channel 00/0 : 37[46000] -> 38[85000] via P2P/IPC +gpub052:1901668:1901749 [1] NCCL INFO Channel 01/0 : 37[46000] -> 38[85000] via P2P/IPC +gpub052:1901668:1901749 [1] NCCL INFO Connected all rings +gpub052:1901668:1901749 [1] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [send] via NET/IB/0 +gpub052:1901668:1901749 [1] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [receive] via NET/IB/0 +gpub052:1901668:1901749 [1] NCCL INFO Channel 00/0 : 37[46000] -> 36[7000] via P2P/IPC +gpub052:1901668:1901749 [1] NCCL INFO Channel 01/0 : 37[46000] -> 36[7000] via P2P/IPC +gpub052:1901668:1901749 [1] NCCL INFO Connected all trees +gpub052:1901668:1901749 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub052:1901668:1901749 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub052:1901668:1901749 [1] NCCL INFO comm 0x50134230 rank 37 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub080:4113203:4113203 [0] NCCL INFO cudaDriverVersion 12010 +gpub080:4113203:4113203 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.180<0> +gpub080:4113203:4113203 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub080:4113203:4113290 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.180<0> +gpub080:4113203:4113290 [0] NCCL INFO Using network IB +gpub080:4113203:4113290 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub080:4113203:4113290 [0] NCCL INFO Trees [0] 53/-1/-1->52->57 [1] 53/48/-1->52->45 
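As a reading aid for the interleaved NCCL lines (a sketch of the rank convention these logs follow, verified against the "comm ... rank R nranks 64" lines above, not an ESPnet or NCCL API): with 16 nodes and 4 GPUs per node, the global rank is node_index * 4 + local_gpu with nodes taken in SLURM_JOB_NODELIST order, and busId 7000/46000/85000/c7000 recur because they are cudaDev 0-3 on every node.

# Sketch: map (node, local GPU) to the global rank printed by NCCL,
# assuming ranks follow SLURM_JOB_NODELIST order (checked against the
# "comm ... rank R nranks 64" lines in this log).
NODES = ["gpub015", "gpub026", "gpub031", "gpub032", "gpub036", "gpub037",
         "gpub049", "gpub050", "gpub051", "gpub052", "gpub053", "gpub078",
         "gpub079", "gpub080", "gpub081", "gpub082"]

def global_rank(node: str, local_gpu: int, gpus_per_node: int = 4) -> int:
    return NODES.index(node) * gpus_per_node + local_gpu

assert global_rank("gpub015", 3) == 3     # gpub015 ... [3] ... rank 3
assert global_rank("gpub032", 3) == 15    # gpub032 ... [3] ... rank 15
assert global_rank("gpub051", 3) == 35    # gpub051 ... [3] ... rank 35
assert global_rank("gpub052", 1) == 37    # gpub052 ... [1] ... rank 37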
+gpub080:4113203:4113290 [0] NCCL INFO Channel 00/0 : 51[c7000] -> 52[7000] [receive] via NET/IB/0 +gpub080:4113203:4113290 [0] NCCL INFO Channel 01/0 : 51[c7000] -> 52[7000] [receive] via NET/IB/0 +gpub080:4113203:4113290 [0] NCCL INFO Channel 00/0 : 52[7000] -> 53[46000] via P2P/IPC +gpub080:4113203:4113290 [0] NCCL INFO Channel 01/0 : 52[7000] -> 53[46000] via P2P/IPC +gpub080:4113203:4113290 [0] NCCL INFO Connected all rings +gpub080:4113203:4113290 [0] NCCL INFO Channel 01/0 : 48[7000] -> 52[7000] [receive] via NET/IB/0 +gpub080:4113203:4113290 [0] NCCL INFO Channel 00/0 : 52[7000] -> 57[46000] [send] via NET/IB/0 +gpub080:4113203:4113290 [0] NCCL INFO Channel 01/0 : 45[46000] -> 52[7000] [receive] via NET/IB/0 +gpub080:4113203:4113290 [0] NCCL INFO Channel 01/0 : 52[7000] -> 45[46000] [send] via NET/IB/0 +gpub080:4113203:4113290 [0] NCCL INFO Channel 00/0 : 57[46000] -> 52[7000] [receive] via NET/IB/0 +gpub080:4113203:4113290 [0] NCCL INFO Channel 01/0 : 52[7000] -> 48[7000] [send] via NET/IB/0 +gpub080:4113203:4113290 [0] NCCL INFO Connected all trees +gpub080:4113203:4113290 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub080:4113203:4113290 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub080:4113203:4113290 [0] NCCL INFO comm 0xa21d7f0 rank 52 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub079:2657935:2657935 [3] NCCL INFO cudaDriverVersion 12010 +gpub079:2657935:2657935 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.179<0> +gpub079:2657935:2657935 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub079:2657935:2658008 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.179<0> +gpub079:2657935:2658008 [3] NCCL INFO Using network IB +gpub079:2657935:2658008 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub079:2657935:2658008 [3] NCCL INFO Trees [0] -1/-1/-1->51->50 [1] -1/-1/-1->51->50 +gpub079:2657935:2658008 [3] NCCL INFO Channel 00/0 : 51[c7000] -> 52[7000] [send] via NET/IB/0 +gpub079:2657935:2658008 [3] NCCL INFO Channel 01/0 : 51[c7000] -> 52[7000] [send] via NET/IB/0 +gpub079:2657935:2658008 [3] NCCL INFO Connected all rings +gpub079:2657935:2658008 [3] NCCL INFO Channel 00/0 : 51[c7000] -> 50[85000] via P2P/IPC +gpub079:2657935:2658008 [3] NCCL INFO Channel 01/0 : 51[c7000] -> 50[85000] via P2P/IPC +gpub079:2657935:2658008 [3] NCCL INFO Connected all trees +gpub079:2657935:2658008 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub079:2657935:2658008 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub079:2657935:2658008 [3] NCCL INFO comm 0x4edd83d0 rank 51 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub032:3289605:3289605 [2] NCCL INFO cudaDriverVersion 12010 +gpub032:3289605:3289605 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.132<0> +gpub032:3289605:3289605 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub032:3289605:3289686 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.132<0> +gpub032:3289605:3289686 [2] NCCL INFO Using network IB +gpub032:3289605:3289686 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub032:3289605:3289686 [2] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 +gpub032:3289605:3289686 [2] NCCL INFO Channel 00/0 : 14[85000] -> 15[c7000] via P2P/IPC +gpub032:3289605:3289686 [2] NCCL INFO Channel 01/0 : 14[85000] -> 15[c7000] via P2P/IPC +gpub032:3289605:3289686 [2] NCCL INFO Connected all rings 
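The batch-sampler summaries logged before the NCCL setup follow from two numbers: [plot_att]'s N-batch=129591 at batch_size=1 says the dev set holds 129,591 utterances, and cutting those into round(N / 128) near-equal batches reproduces the [valid] line exactly; the same arithmetic puts train split.1 at about 37994 x 128, roughly 4.86 M utterances. A minimal sketch of the arithmetic (illustrative only, not ESPnet's UnsortedBatchSampler source):

# Sketch: derive "[valid] mini-batch sizes summary: N-batch=1012,
# mean=128.1, min=128, max=129" from 129,591 dev utterances at
# batch_size=128.
def batch_stats(n_utts: int, batch_size: int = 128):
    n_batch = max(1, round(n_utts / batch_size))
    base, extra = divmod(n_utts, n_batch)   # 'extra' batches get one more item
    sizes = [base + 1] * extra + [base] * (n_batch - extra)
    return n_batch, sum(sizes) / n_batch, min(sizes), max(sizes)

print(batch_stats(129_591))   # (1012, 128.05..., 128, 129)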
+gpub032:3289605:3289686 [2] NCCL INFO Channel 00/0 : 14[85000] -> 13[46000] via P2P/IPC
+gpub032:3289605:3289686 [2] NCCL INFO Channel 01/0 : 14[85000] -> 13[46000] via P2P/IPC
+gpub032:3289605:3289686 [2] NCCL INFO Connected all trees
+gpub032:3289605:3289686 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub032:3289605:3289686 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub032:3289605:3289686 [2] NCCL INFO comm 0xb6f8bc90 rank 14 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub032:3289604:3289604 [1] NCCL INFO cudaDriverVersion 12010
+gpub032:3289604:3289604 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.132<0>
+gpub032:3289604:3289604 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub032:3289604:3289685 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.132<0>
+gpub032:3289604:3289685 [1] NCCL INFO Using network IB
+gpub032:3289604:3289685 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub032:3289604:3289685 [1] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/20/-1->13->12
+gpub032:3289604:3289685 [1] NCCL INFO Channel 00/0 : 13[46000] -> 14[85000] via P2P/IPC
+gpub032:3289604:3289685 [1] NCCL INFO Channel 01/0 : 13[46000] -> 14[85000] via P2P/IPC
+gpub032:3289604:3289685 [1] NCCL INFO Connected all rings
+gpub032:3289604:3289685 [1] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [send] via NET/IB/0
+gpub032:3289604:3289685 [1] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [receive] via NET/IB/0
+gpub032:3289604:3289685 [1] NCCL INFO Channel 00/0 : 13[46000] -> 12[7000] via P2P/IPC
+gpub032:3289604:3289685 [1] NCCL INFO Channel 01/0 : 13[46000] -> 12[7000] via P2P/IPC
+gpub032:3289604:3289685 [1] NCCL INFO Connected all trees
+gpub032:3289604:3289685 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub032:3289604:3289685 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub032:3289604:3289685 [1] NCCL INFO comm 0x50c34690 rank 13 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub079:2657932:2657932 [0] NCCL INFO cudaDriverVersion 12010
+gpub079:2657932:2657932 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.179<0>
+gpub079:2657932:2657932 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub079:2657932:2658009 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.179<0>
+gpub079:2657932:2658009 [0] NCCL INFO Using network IB
+gpub079:2657932:2658009 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub079:2657932:2658009 [0] NCCL INFO Trees [0] 49/56/-1->48->32 [1] 49/-1/-1->48->52
+gpub079:2657932:2658009 [0] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [receive] via NET/IB/0
+gpub079:2657932:2658009 [0] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [receive] via NET/IB/0
+gpub079:2657932:2658009 [0] NCCL INFO Channel 00/0 : 48[7000] -> 49[46000] via P2P/IPC
+gpub079:2657932:2658009 [0] NCCL INFO Channel 01/0 : 48[7000] -> 49[46000] via P2P/IPC
+gpub079:2657932:2658009 [0] NCCL INFO Connected all rings
+gpub079:2657932:2658009 [0] NCCL INFO Channel 01/0 : 48[7000] -> 52[7000] [send] via NET/IB/0
+gpub079:2657932:2658009 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [send] via NET/IB/0
+gpub079:2657932:2658009 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [receive] via NET/IB/0
+gpub079:2657932:2658009 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [send] via NET/IB/0
+gpub079:2657932:2658009 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [receive] via NET/IB/0
+gpub079:2657932:2658009 [0] NCCL INFO Channel 01/0 : 52[7000] -> 48[7000] [receive] via NET/IB/0
+gpub079:2657932:2658009 [0] NCCL INFO Connected all trees
+gpub079:2657932:2658009 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub079:2657932:2658009 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub079:2657932:2658009 [0] NCCL INFO comm 0x8c890be0 rank 48 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub031:1921205:1921205 [1] NCCL INFO cudaDriverVersion 12010
+gpub031:1921205:1921205 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0>
+gpub031:1921205:1921205 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub031:1921205:1921287 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0>
+gpub031:1921205:1921287 [1] NCCL INFO Using network IB
+gpub031:1921205:1921287 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub031:1921205:1921287 [1] NCCL INFO Trees [0] 10/4/-1->9->8 [1] 10/-1/-1->9->8
+gpub031:1921205:1921287 [1] NCCL INFO Channel 00/0 : 9[46000] -> 10[85000] via P2P/IPC
+gpub031:1921205:1921287 [1] NCCL INFO Channel 01/0 : 9[46000] -> 10[85000] via P2P/IPC
+gpub031:1921205:1921287 [1] NCCL INFO Connected all rings
+gpub031:1921205:1921287 [1] NCCL INFO Channel 00/0 : 4[7000] -> 9[46000] [receive] via NET/IB/0
+gpub031:1921205:1921287 [1] NCCL INFO Channel 00/0 : 9[46000] -> 4[7000] [send] via NET/IB/0
+gpub031:1921205:1921287 [1] NCCL INFO Channel 00/0 : 9[46000] -> 8[7000] via P2P/IPC
+gpub031:1921205:1921287 [1] NCCL INFO Channel 01/0 : 9[46000] -> 8[7000] via P2P/IPC
+gpub031:1921205:1921287 [1] NCCL INFO Connected all trees
+gpub031:1921205:1921287 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub031:1921205:1921287 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub031:1921205:1921287 [1] NCCL INFO comm 0x92a3a80 rank 9 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub051:2913623:2913623 [0] NCCL INFO cudaDriverVersion 12010
+gpub051:2913623:2913623 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.151<0>
+gpub051:2913623:2913623 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub051:2913623:2913706 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.151<0>
+gpub051:2913623:2913706 [0] NCCL INFO Using network IB
+gpub051:2913623:2913706 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub051:2913623:2913706 [0] NCCL INFO Trees [0] 33/48/-1->32->0 [1] 33/-1/-1->32->36
+gpub051:2913623:2913706 [0] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [receive] via NET/IB/0
+gpub051:2913623:2913706 [0] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [receive] via NET/IB/0
+gpub051:2913623:2913706 [0] NCCL INFO Channel 00/0 : 32[7000] -> 33[46000] via P2P/IPC
+gpub051:2913623:2913706 [0] NCCL INFO Channel 01/0 : 32[7000] -> 33[46000] via P2P/IPC
+gpub051:2913623:2913706 [0] NCCL INFO Connected all rings
+gpub051:2913623:2913706 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [send] via NET/IB/0
+gpub051:2913623:2913706 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [send] via NET/IB/0
+gpub051:2913623:2913706 [0] NCCL INFO Channel 00/0 : 0[7000] -> 32[7000] [receive] via NET/IB/0
+gpub051:2913623:2913706 [0] NCCL INFO Channel 00/0 : 32[7000] -> 0[7000] [send] via NET/IB/0
+gpub051:2913623:2913706 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [receive] via NET/IB/0
+gpub051:2913623:2913706 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [receive] via NET/IB/0
+gpub051:2913623:2913706 [0] NCCL INFO Connected all trees
+gpub051:2913623:2913706 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub051:2913623:2913706 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub051:2913623:2913706 [0] NCCL INFO comm 0x8dc3e980 rank 32 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub050:1879226:1879226 [1] NCCL INFO cudaDriverVersion 12010
+gpub050:1879226:1879226 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.150<0>
+gpub050:1879226:1879226 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub050:1879226:1879305 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.150<0>
+gpub050:1879226:1879305 [1] NCCL INFO Using network IB
+gpub050:1879226:1879305 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub050:1879226:1879305 [1] NCCL INFO Trees [0] 30/-1/-1->29->28 [1] 30/44/-1->29->28
+gpub050:1879226:1879305 [1] NCCL INFO Channel 00/0 : 29[46000] -> 30[85000] via P2P/IPC
+gpub050:1879226:1879305 [1] NCCL INFO Channel 01/0 : 29[46000] -> 30[85000] via P2P/IPC
+gpub050:1879226:1879305 [1] NCCL INFO Connected all rings
+gpub050:1879226:1879305 [1] NCCL INFO Channel 01/0 : 29[46000] -> 44[7000] [send] via NET/IB/0
+gpub050:1879226:1879305 [1] NCCL INFO Channel 01/0 : 44[7000] -> 29[46000] [receive] via NET/IB/0
+gpub080:4113206:4113206 [3] NCCL INFO cudaDriverVersion 12010
+gpub080:4113206:4113206 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.180<0>
+gpub080:4113206:4113206 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub080:4113206:4113289 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.180<0>
+gpub080:4113206:4113289 [3] NCCL INFO Using network IB
+gpub080:4113206:4113289 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub080:4113206:4113289 [3] NCCL INFO Trees [0] -1/-1/-1->55->54 [1] -1/-1/-1->55->54
+gpub080:4113206:4113289 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [send] via NET/IB/0
+gpub080:4113206:4113289 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [send] via NET/IB/0
+gpub080:4113206:4113289 [3] NCCL INFO Connected all rings
+gpub080:4113206:4113289 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 54[85000] via P2P/IPC
+gpub080:4113206:4113289 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 54[85000] via P2P/IPC
+gpub050:1879226:1879305 [1] NCCL INFO Channel 00/0 : 29[46000] -> 28[7000] via P2P/IPC
+gpub050:1879226:1879305 [1] NCCL INFO Channel 01/0 : 29[46000] -> 28[7000] via P2P/IPC
+gpub050:1879226:1879305 [1] NCCL INFO Connected all trees
+gpub050:1879226:1879305 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub050:1879226:1879305 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub050:1879226:1879305 [1] NCCL INFO comm 0x50792660 rank 29 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub080:4113206:4113289 [3] NCCL INFO Connected all trees
+gpub080:4113206:4113289 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub080:4113206:4113289 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub080:4113206:4113289 [3] NCCL INFO comm 0x8c72c2a0 rank 55 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub050:1879227:1879227 [2] NCCL INFO cudaDriverVersion 12010
+gpub050:1879227:1879227 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.150<0>
+gpub050:1879227:1879227 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub050:1879227:1879304 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.150<0>
+gpub050:1879227:1879304 [2] NCCL INFO Using network IB
+gpub050:1879227:1879304 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub050:1879227:1879304 [2] NCCL INFO Trees [0] 31/-1/-1->30->29 [1] 31/-1/-1->30->29
+gpub050:1879227:1879304 [2] NCCL INFO Channel 00/0 : 30[85000] -> 31[c7000] via P2P/IPC
+gpub050:1879227:1879304 [2] NCCL INFO Channel 01/0 : 30[85000] -> 31[c7000] via P2P/IPC
+gpub050:1879227:1879304 [2] NCCL INFO Connected all rings
+gpub050:1879227:1879304 [2] NCCL INFO Channel 00/0 : 30[85000] -> 29[46000] via P2P/IPC
+gpub050:1879227:1879304 [2] NCCL INFO Channel 01/0 : 30[85000] -> 29[46000] via P2P/IPC
+gpub049:4064877:4064877 [3] NCCL INFO cudaDriverVersion 12010
+gpub049:4064877:4064877 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.149<0>
+gpub049:4064877:4064877 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub049:4064877:4064941 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.149<0>
+gpub049:4064877:4064941 [3] NCCL INFO Using network IB
+gpub049:4064877:4064941 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub049:4064877:4064941 [3] NCCL INFO Trees [0] -1/-1/-1->27->26 [1] -1/-1/-1->27->26
+gpub049:4064877:4064941 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [send] via NET/IB/0
+gpub049:4064877:4064941 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [send] via NET/IB/0
+gpub049:4064877:4064941 [3] NCCL INFO Connected all rings
+gpub049:4064877:4064941 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 26[85000] via P2P/IPC
+gpub049:4064877:4064941 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 26[85000] via P2P/IPC
+gpub050:1879227:1879304 [2] NCCL INFO Connected all trees
+gpub050:1879227:1879304 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub050:1879227:1879304 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub050:1879227:1879304 [2] NCCL INFO comm 0x50baa200 rank 30 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub049:4064877:4064941 [3] NCCL INFO Connected all trees
+gpub049:4064877:4064941 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub049:4064877:4064941 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub049:4064877:4064941 [3] NCCL INFO comm 0x4f5c00a0 rank 27 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub079:2657934:2657934 [2] NCCL INFO cudaDriverVersion 12010
+gpub079:2657934:2657934 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.179<0>
+gpub079:2657934:2657934 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub079:2657934:2658007 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.179<0>
+gpub079:2657934:2658007 [2] NCCL INFO Using network IB
+gpub079:2657934:2658007 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub079:2657934:2658007 [2] NCCL INFO Trees [0] 51/-1/-1->50->49 [1] 51/-1/-1->50->49
+gpub079:2657934:2658007 [2] NCCL INFO Channel 00/0 : 50[85000] -> 51[c7000] via P2P/IPC
+gpub079:2657934:2658007 [2] NCCL INFO Channel 01/0 : 50[85000] -> 51[c7000] via P2P/IPC
+gpub079:2657934:2658007 [2] NCCL INFO Connected all rings
+gpub079:2657934:2658007 [2] NCCL INFO Channel 00/0 : 50[85000] -> 49[46000] via P2P/IPC
+gpub079:2657934:2658007 [2] NCCL INFO Channel 01/0 : 50[85000] -> 49[46000] via P2P/IPC
+gpub079:2657934:2658007 [2] NCCL INFO Connected all trees
+gpub079:2657934:2658007 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub079:2657934:2658007 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub079:2657934:2658007 [2] NCCL INFO comm 0x505ec9b0 rank 50 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub015:879782:879782 [2] NCCL INFO cudaDriverVersion 12010
+gpub015:879782:879782 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.115<0>
+gpub015:879782:879782 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub015:879782:879850 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.115<0>
+gpub015:879782:879850 [2] NCCL INFO Using network IB
+gpub015:879782:879850 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub015:879782:879850 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
+gpub015:879782:879850 [2] NCCL INFO Channel 00/0 : 2[85000] -> 3[c7000] via P2P/IPC
+gpub015:879782:879850 [2] NCCL INFO Channel 01/0 : 2[85000] -> 3[c7000] via P2P/IPC
+gpub015:879782:879850 [2] NCCL INFO Connected all rings
+gpub015:879782:879850 [2] NCCL INFO Channel 00/0 : 2[85000] -> 1[46000] via P2P/IPC
+gpub015:879782:879850 [2] NCCL INFO Channel 01/0 : 2[85000] -> 1[46000] via P2P/IPC
+gpub015:879782:879850 [2] NCCL INFO Connected all trees
+gpub015:879782:879850 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub015:879782:879850 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub015:879782:879850 [2] NCCL INFO comm 0x502ad7c0 rank 2 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub036:1870498:1870498 [2] NCCL INFO cudaDriverVersion 12010
+gpub036:1870498:1870498 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.136<0>
+gpub036:1870498:1870498 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub036:1870498:1870579 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.136<0>
+gpub036:1870498:1870579 [2] NCCL INFO Using network IB
+gpub036:1870498:1870579 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub036:1870498:1870579 [2] NCCL INFO Trees [0] 19/-1/-1->18->17 [1] 19/-1/-1->18->17
+gpub036:1870498:1870579 [2] NCCL INFO Channel 00/0 : 18[85000] -> 19[c7000] via P2P/IPC
+gpub036:1870498:1870579 [2] NCCL INFO Channel 01/0 : 18[85000] -> 19[c7000] via P2P/IPC
+gpub036:1870498:1870579 [2] NCCL INFO Connected all rings
+gpub036:1870498:1870579 [2] NCCL INFO Channel 00/0 : 18[85000] -> 17[46000] via P2P/IPC
+gpub036:1870498:1870579 [2] NCCL INFO Channel 01/0 : 18[85000] -> 17[46000] via P2P/IPC
+gpub036:1870498:1870579 [2] NCCL INFO Connected all trees
+gpub036:1870498:1870579 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub036:1870498:1870579 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub036:1870498:1870579 [2] NCCL INFO comm 0x50c66a10 rank 18 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub031:1921206:1921206 [2] NCCL INFO cudaDriverVersion 12010
+gpub031:1921206:1921206 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0>
+gpub031:1921206:1921206 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub031:1921206:1921288 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0>
+gpub031:1921206:1921288 [2] NCCL INFO Using network IB
+gpub031:1921206:1921288 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub031:1921206:1921288 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9
+gpub031:1921206:1921288 [2] NCCL INFO Channel 00/0 : 10[85000] -> 11[c7000] via P2P/IPC
+gpub031:1921206:1921288 [2] NCCL INFO Channel 01/0 : 10[85000] -> 11[c7000] via P2P/IPC
+gpub031:1921206:1921288 [2] NCCL INFO Connected all rings
+gpub031:1921206:1921288 [2] NCCL INFO Channel 00/0 : 10[85000] -> 9[46000] via P2P/IPC
+gpub031:1921206:1921288 [2] NCCL INFO Channel 01/0 : 10[85000] -> 9[46000] via P2P/IPC
+gpub031:1921206:1921288 [2] NCCL INFO Connected all trees
+gpub031:1921206:1921288 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub031:1921206:1921288 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub031:1921206:1921288 [2] NCCL INFO comm 0xc2e65190 rank 10 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub082:1518447:1518447 [2] NCCL INFO cudaDriverVersion 12010
+gpub082:1518447:1518447 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.182<0>
+gpub082:1518447:1518447 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub082:1518447:1518526 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.182<0>
+gpub082:1518447:1518526 [2] NCCL INFO Using network IB
+gpub082:1518447:1518526 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub082:1518447:1518526 [2] NCCL INFO Trees [0] 63/-1/-1->62->61 [1] 63/-1/-1->62->61
+gpub082:1518447:1518526 [2] NCCL INFO Channel 00/0 : 62[85000] -> 63[c7000] via P2P/IPC
+gpub082:1518447:1518526 [2] NCCL INFO Channel 01/0 : 62[85000] -> 63[c7000] via P2P/IPC
+gpub082:1518447:1518526 [2] NCCL INFO Connected all rings
+gpub082:1518447:1518526 [2] NCCL INFO Channel 00/0 : 62[85000] -> 61[46000] via P2P/IPC
+gpub082:1518447:1518526 [2] NCCL INFO Channel 01/0 : 62[85000] -> 61[46000] via P2P/IPC
+gpub082:1518447:1518526 [2] NCCL INFO Connected all trees
+gpub082:1518447:1518526 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub082:1518447:1518526 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub082:1518447:1518526 [2] NCCL INFO comm 0xb6376a90 rank 62 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub026:2433084:2433084 [0] NCCL INFO cudaDriverVersion 12010
+gpub026:2433084:2433084 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.126<0>
+gpub026:2433084:2433084 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub026:2433084:2433166 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.126<0>
+gpub026:2433084:2433166 [0] NCCL INFO Using network IB
+gpub026:2433084:2433166 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub026:2433084:2433166 [0] NCCL INFO Trees [0] 5/-1/-1->4->9 [1] 5/0/-1->4->12
+gpub026:2433084:2433166 [0] NCCL INFO Channel 00/0 : 3[c7000] -> 4[7000] [receive] via NET/IB/0
+gpub026:2433084:2433166 [0] NCCL INFO Channel 01/0 : 3[c7000] -> 4[7000] [receive] via NET/IB/0
+gpub026:2433084:2433166 [0] NCCL INFO Channel 00/0 : 4[7000] -> 5[46000] via P2P/IPC
+gpub026:2433084:2433166 [0] NCCL INFO Channel 01/0 : 4[7000] -> 5[46000] via P2P/IPC
+gpub026:2433084:2433166 [0] NCCL INFO Connected all rings
+gpub026:2433084:2433166 [0] NCCL INFO Channel 01/0 : 0[7000] -> 4[7000] [receive] via NET/IB/0
+gpub026:2433084:2433166 [0] NCCL INFO Channel 00/0 : 4[7000] -> 9[46000] [send] via NET/IB/0
+gpub026:2433084:2433166 [0] NCCL INFO Channel 01/0 : 4[7000] -> 12[7000] [send] via NET/IB/0
+gpub026:2433084:2433166 [0] NCCL INFO Channel 01/0 : 12[7000] -> 4[7000] [receive] via NET/IB/0
+gpub026:2433084:2433166 [0] NCCL INFO Channel 00/0 : 9[46000] -> 4[7000] [receive] via NET/IB/0
+gpub026:2433084:2433166 [0] NCCL INFO Channel 01/0 : 4[7000] -> 0[7000] [send] via NET/IB/0
+gpub026:2433084:2433166 [0] NCCL INFO Connected all trees
+gpub026:2433084:2433166 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub026:2433084:2433166 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub026:2433084:2433166 [0] NCCL INFO comm 0x4fe36690 rank 4 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub049:4064874:4064874 [0] NCCL INFO cudaDriverVersion 12010
+gpub049:4064874:4064874 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.149<0>
+gpub049:4064874:4064874 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub049:4064874:4064942 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.149<0>
+gpub049:4064874:4064942 [0] NCCL INFO Using network IB
+gpub049:4064874:4064942 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub049:4064874:4064942 [0] NCCL INFO Trees [0] 25/28/-1->24->16 [1] 25/-1/-1->24->21
+gpub049:4064874:4064942 [0] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [receive] via NET/IB/0
+gpub049:4064874:4064942 [0] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [receive] via NET/IB/0
+gpub049:4064874:4064942 [0] NCCL INFO Channel 00/0 : 24[7000] -> 25[46000] via P2P/IPC
+gpub049:4064874:4064942 [0] NCCL INFO Channel 01/0 : 24[7000] -> 25[46000] via P2P/IPC
+gpub049:4064874:4064942 [0] NCCL INFO Connected all rings
+gpub049:4064874:4064942 [0] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [receive] via NET/IB/0
+gpub049:4064874:4064942 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [send] via NET/IB/0
+gpub049:4064874:4064942 [0] NCCL INFO Channel 00/0 : 16[7000] -> 24[7000] [receive] via NET/IB/0
+gpub049:4064874:4064942 [0] NCCL INFO Channel 00/0 : 24[7000] -> 16[7000] [send] via NET/IB/0
+gpub049:4064874:4064942 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [receive] via NET/IB/0
+gpub049:4064874:4064942 [0] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [send] via NET/IB/0
+gpub049:4064874:4064942 [0] NCCL INFO Connected all trees
+gpub049:4064874:4064942 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub049:4064874:4064942 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub049:4064874:4064942 [0] NCCL INFO comm 0x500f4c60 rank 24 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub082:1518446:1518446 [1] NCCL INFO cudaDriverVersion 12010
+gpub082:1518446:1518446 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.182<0>
+gpub082:1518446:1518446 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub082:1518446:1518525 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.182<0>
+gpub082:1518446:1518525 [1] NCCL INFO Using network IB
+gpub082:1518446:1518525 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub082:1518446:1518525 [1] NCCL INFO Trees [0] 62/-1/-1->61->60 [1] 62/-1/-1->61->60
+gpub082:1518446:1518525 [1] NCCL INFO Channel 00/0 : 61[46000] -> 62[85000] via P2P/IPC
+gpub082:1518446:1518525 [1] NCCL INFO Channel 01/0 : 61[46000] -> 62[85000] via P2P/IPC
+gpub082:1518446:1518525 [1] NCCL INFO Connected all rings
+gpub082:1518446:1518525 [1] NCCL INFO Channel 00/0 : 61[46000] -> 60[7000] via P2P/IPC
+gpub082:1518446:1518525 [1] NCCL INFO Channel 01/0 : 61[46000] -> 60[7000] via P2P/IPC
+gpub082:1518446:1518525 [1] NCCL INFO Connected all trees
+gpub082:1518446:1518525 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub082:1518446:1518525 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub082:1518446:1518525 [1] NCCL INFO comm 0xb6caaae0 rank 61 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub036:1870496:1870496 [0] NCCL INFO cudaDriverVersion 12010
+gpub036:1870496:1870496 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.136<0>
+gpub036:1870496:1870496 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub036:1870496:1870578 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.136<0>
+gpub036:1870496:1870578 [0] NCCL INFO Using network IB
+gpub036:1870496:1870578 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub036:1870496:1870578 [0] NCCL INFO Trees [0] 17/24/-1->16->33 [1] 17/-1/-1->16->20
+gpub036:1870496:1870578 [0] NCCL INFO Channel 00/0 : 15[c7000] -> 16[7000] [receive] via NET/IB/0
+gpub036:1870496:1870578 [0] NCCL INFO Channel 01/0 : 15[c7000] -> 16[7000] [receive] via NET/IB/0
+gpub036:1870496:1870578 [0] NCCL INFO Channel 00/0 : 16[7000] -> 17[46000] via P2P/IPC
+gpub036:1870496:1870578 [0] NCCL INFO Channel 01/0 : 16[7000] -> 17[46000] via P2P/IPC
+gpub036:1870496:1870578 [0] NCCL INFO Connected all rings
+gpub036:1870496:1870578 [0] NCCL INFO Channel 01/0 : 16[7000] -> 20[7000] [send] via NET/IB/0
+gpub036:1870496:1870578 [0] NCCL INFO Channel 00/0 : 16[7000] -> 24[7000] [send] via NET/IB/0
+gpub036:1870496:1870578 [0] NCCL INFO Channel 00/0 : 16[7000] -> 33[46000] [send] via NET/IB/0
+gpub036:1870496:1870578 [0] NCCL INFO Channel 00/0 : 33[46000] -> 16[7000] [receive] via NET/IB/0
+gpub036:1870496:1870578 [0] NCCL INFO Channel 00/0 : 24[7000] -> 16[7000] [receive] via NET/IB/0
+gpub036:1870496:1870578 [0] NCCL INFO Channel 01/0 : 20[7000] -> 16[7000] [receive] via NET/IB/0
+gpub036:1870496:1870578 [0] NCCL INFO Connected all trees
+gpub036:1870496:1870578 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub036:1870496:1870578 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub036:1870496:1870578 [0] NCCL INFO comm 0xad17bd0 rank 16 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub081:2742227:2742227 [0] NCCL INFO cudaDriverVersion 12010
+gpub081:2742227:2742227 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.181<0>
+gpub081:2742227:2742227 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub081:2742227:2742317 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.181<0>
+gpub081:2742227:2742317 [0] NCCL INFO Using network IB
+gpub081:2742227:2742317 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub081:2742227:2742317 [0] NCCL INFO Trees [0] 57/60/-1->56->48 [1] 57/-1/-1->56->53
+gpub081:2742227:2742317 [0] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [receive] via NET/IB/0
+gpub081:2742227:2742317 [0] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [receive] via NET/IB/0
+gpub081:2742227:2742317 [0] NCCL INFO Channel 00/0 : 56[7000] -> 57[46000] via P2P/IPC
+gpub081:2742227:2742317 [0] NCCL INFO Channel 01/0 : 56[7000] -> 57[46000] via P2P/IPC
+gpub081:2742227:2742317 [0] NCCL INFO Connected all rings
+gpub026:2433087:2433087 [3] NCCL INFO cudaDriverVersion 12010
+gpub026:2433087:2433087 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.126<0>
+gpub026:2433087:2433087 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub026:2433087:2433163 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.126<0>
+gpub026:2433087:2433163 [3] NCCL INFO Using network IB
+gpub026:2433087:2433163 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub026:2433087:2433163 [3] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6
+gpub026:2433087:2433163 [3] NCCL INFO Channel 00/0 : 7[c7000] -> 8[7000] [send] via NET/IB/0
+gpub026:2433087:2433163 [3] NCCL INFO Channel 01/0 : 7[c7000] -> 8[7000] [send] via NET/IB/0
+gpub026:2433087:2433163 [3] NCCL INFO Connected all rings
+gpub026:2433087:2433163 [3] NCCL INFO Channel 00/0 : 7[c7000] -> 6[85000] via P2P/IPC
+gpub026:2433087:2433163 [3] NCCL INFO Channel 01/0 : 7[c7000] -> 6[85000] via P2P/IPC
+gpub081:2742227:2742317 [0] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [receive] via NET/IB/0
+gpub081:2742227:2742317 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [send] via NET/IB/0
+gpub081:2742227:2742317 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [receive] via NET/IB/0
+gpub081:2742227:2742317 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [send] via NET/IB/0
+gpub081:2742227:2742317 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [receive] via NET/IB/0
+gpub081:2742227:2742317 [0] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [send] via NET/IB/0
+gpub081:2742227:2742317 [0] NCCL INFO Connected all trees
+gpub081:2742227:2742317 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub081:2742227:2742317 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub081:2742227:2742317 [0] NCCL INFO comm 0x518b4950 rank 56 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub026:2433087:2433163 [3] NCCL INFO Connected all trees
+gpub026:2433087:2433163 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub026:2433087:2433163 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub026:2433087:2433163 [3] NCCL INFO comm 0x50347080 rank 7 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub051:2913624:2913624 [1] NCCL INFO cudaDriverVersion 12010
+gpub051:2913624:2913624 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.151<0>
+gpub051:2913624:2913624 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub051:2913624:2913707 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.151<0>
+gpub051:2913624:2913707 [1] NCCL INFO Using network IB
+gpub051:2913624:2913707 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub051:2913624:2913707 [1] NCCL INFO Trees [0] 34/16/-1->33->32 [1] 34/-1/-1->33->32
+gpub051:2913624:2913707 [1] NCCL INFO Channel 00/0 : 33[46000] -> 34[85000] via P2P/IPC
+gpub051:2913624:2913707 [1] NCCL INFO Channel 01/0 : 33[46000] -> 34[85000] via P2P/IPC
+gpub051:2913624:2913707 [1] NCCL INFO Connected all rings
+gpub051:2913624:2913707 [1] NCCL INFO Channel 00/0 : 16[7000] -> 33[46000] [receive] via NET/IB/0
+gpub051:2913624:2913707 [1] NCCL INFO Channel 00/0 : 33[46000] -> 16[7000] [send] via NET/IB/0
+gpub051:2913624:2913707 [1] NCCL INFO Channel 00/0 : 33[46000] -> 32[7000] via P2P/IPC
+gpub051:2913624:2913707 [1] NCCL INFO Channel 01/0 : 33[46000] -> 32[7000] via P2P/IPC
+gpub051:2913624:2913707 [1] NCCL INFO Connected all trees
+gpub051:2913624:2913707 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub051:2913624:2913707 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub051:2913624:2913707 [1] NCCL INFO comm 0xbb329750 rank 33 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub052:1901670:1901670 [3] NCCL INFO cudaDriverVersion 12010
+gpub052:1901670:1901670 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.152<0>
+gpub052:1901670:1901670 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub052:1901670:1901750 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.152<0>
+gpub052:1901670:1901750 [3] NCCL INFO Using network IB
+gpub052:1901670:1901750 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub052:1901670:1901750 [3] NCCL INFO Trees [0] -1/-1/-1->39->38 [1] -1/-1/-1->39->38
+gpub052:1901670:1901750 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [send] via NET/IB/0
+gpub052:1901670:1901750 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [send] via NET/IB/0
+gpub052:1901670:1901750 [3] NCCL INFO Connected all rings
+gpub052:1901670:1901750 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 38[85000] via P2P/IPC
+gpub052:1901670:1901750 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 38[85000] via P2P/IPC
+gpub052:1901670:1901750 [3] NCCL INFO Connected all trees
+gpub052:1901670:1901750 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub052:1901670:1901750 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub052:1901670:1901750 [3] NCCL INFO comm 0xb6ced700 rank 39 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub036:1870497:1870497 [1] NCCL INFO cudaDriverVersion 12010
+gpub036:1870497:1870497 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.136<0>
+gpub036:1870497:1870497 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub036:1870497:1870580 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.136<0>
+gpub036:1870497:1870580 [1] NCCL INFO Using network IB
+gpub036:1870497:1870580 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub036:1870497:1870580 [1] NCCL INFO Trees [0] 18/8/-1->17->16 [1] 18/-1/-1->17->16
+gpub036:1870497:1870580 [1] NCCL INFO Channel 00/0 : 17[46000] -> 18[85000] via P2P/IPC
+gpub036:1870497:1870580 [1] NCCL INFO Channel 01/0 : 17[46000] -> 18[85000] via P2P/IPC
+gpub036:1870497:1870580 [1] NCCL INFO Connected all rings
+gpub036:1870497:1870580 [1] NCCL INFO Channel 00/0 : 8[7000] -> 17[46000] [receive] via NET/IB/0
+gpub036:1870497:1870580 [1] NCCL INFO Channel 00/0 : 17[46000] -> 8[7000] [send] via NET/IB/0
+gpub036:1870497:1870580 [1] NCCL INFO Channel 00/0 : 17[46000] -> 16[7000] via P2P/IPC
+gpub036:1870497:1870580 [1] NCCL INFO Channel 01/0 : 17[46000] -> 16[7000] via P2P/IPC
+gpub036:1870497:1870580 [1] NCCL INFO Connected all trees
+gpub036:1870497:1870580 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub036:1870497:1870580 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub036:1870497:1870580 [1] NCCL INFO comm 0x4fcaadc0 rank 17 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub078:4170391:4170391 [2] NCCL INFO cudaDriverVersion 12010
+gpub078:4170391:4170391 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.178<0>
+gpub078:4170391:4170391 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub078:4170391:4170469 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.178<0>
+gpub078:4170391:4170469 [2] NCCL INFO Using network IB
+gpub078:4170391:4170469 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub078:4170391:4170469 [2] NCCL INFO Trees [0] 47/-1/-1->46->45 [1] 47/-1/-1->46->45
+gpub078:4170391:4170469 [2] NCCL INFO Channel 00/0 : 46[85000] -> 47[c7000] via P2P/IPC
+gpub078:4170391:4170469 [2] NCCL INFO Channel 01/0 : 46[85000] -> 47[c7000] via P2P/IPC
+gpub078:4170391:4170469 [2] NCCL INFO Connected all rings
+gpub078:4170391:4170469 [2] NCCL INFO Channel 00/0 : 46[85000] -> 45[46000] via P2P/IPC
+gpub078:4170391:4170469 [2] NCCL INFO Channel 01/0 : 46[85000] -> 45[46000] via P2P/IPC
+gpub078:4170391:4170469 [2] NCCL INFO Connected all trees
+gpub078:4170391:4170469 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub078:4170391:4170469 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub078:4170391:4170469 [2] NCCL INFO comm 0x5187a990 rank 46 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub082:1518448:1518448 [3] NCCL INFO cudaDriverVersion 12010
+gpub082:1518448:1518448 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.182<0>
+gpub082:1518448:1518448 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub082:1518448:1518524 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.182<0>
+gpub082:1518448:1518524 [3] NCCL INFO Using network IB
+gpub082:1518448:1518524 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub082:1518448:1518524 [3] NCCL INFO Trees [0] -1/-1/-1->63->62 [1] -1/-1/-1->63->62
+gpub082:1518448:1518524 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 0[7000] [send] via NET/IB/0
+gpub082:1518448:1518524 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 0[7000] [send] via NET/IB/0
+gpub082:1518448:1518524 [3] NCCL INFO Connected all rings
+gpub082:1518448:1518524 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 62[85000] via P2P/IPC
+gpub082:1518448:1518524 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 62[85000] via P2P/IPC
+gpub082:1518448:1518524 [3] NCCL INFO Connected all trees
+gpub082:1518448:1518524 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub082:1518448:1518524 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub082:1518448:1518524 [3] NCCL INFO comm 0x8c5b6f90 rank 63 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub053:1664487:1664487 [1] NCCL INFO cudaDriverVersion 12010
+gpub053:1664487:1664487 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.153<0>
+gpub053:1664487:1664487 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub053:1664487:1664558 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.153<0>
+gpub053:1664487:1664558 [1] NCCL INFO Using network IB
+gpub053:1664487:1664558 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub053:1664487:1664558 [1] NCCL INFO Trees [0] 42/36/-1->41->40 [1] 42/-1/-1->41->40
+gpub053:1664487:1664558 [1] NCCL INFO Channel 00/0 : 41[46000] -> 42[85000] via P2P/IPC
+gpub053:1664487:1664558 [1] NCCL INFO Channel 01/0 : 41[46000] -> 42[85000] via P2P/IPC
+gpub053:1664487:1664558 [1] NCCL INFO Connected all rings
+gpub053:1664487:1664558 [1] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [receive] via NET/IB/0
+gpub053:1664487:1664558 [1] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [send] via NET/IB/0
+gpub053:1664487:1664558 [1] NCCL INFO Channel 00/0 : 41[46000] -> 40[7000] via P2P/IPC
+gpub053:1664487:1664558 [1] NCCL INFO Channel 01/0 : 41[46000] -> 40[7000] via P2P/IPC
+gpub053:1664487:1664558 [1] NCCL INFO Connected all trees
+gpub053:1664487:1664558 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub053:1664487:1664558 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub053:1664487:1664558 [1] NCCL INFO comm 0x506110d0 rank 41 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub026:2433086:2433086 [2] NCCL INFO cudaDriverVersion 12010
+gpub026:2433086:2433086 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.126<0>
+gpub026:2433086:2433086 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub026:2433086:2433164 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.126<0>
+gpub026:2433086:2433164 [2] NCCL INFO Using network IB
+gpub026:2433086:2433164 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub026:2433086:2433164 [2] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5
+gpub026:2433086:2433164 [2] NCCL INFO Channel 00/0 : 6[85000] -> 7[c7000] via P2P/IPC
+gpub026:2433086:2433164 [2] NCCL INFO Channel 01/0 : 6[85000] -> 7[c7000] via P2P/IPC
+gpub026:2433086:2433164 [2] NCCL INFO Connected all rings
+gpub026:2433086:2433164 [2] NCCL INFO Channel 00/0 : 6[85000] -> 5[46000] via P2P/IPC
+gpub026:2433086:2433164 [2] NCCL INFO Channel 01/0 : 6[85000] -> 5[46000] via P2P/IPC
+gpub026:2433086:2433164 [2] NCCL INFO Connected all trees
+gpub026:2433086:2433164 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub026:2433086:2433164 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub026:2433086:2433164 [2] NCCL INFO comm 0xc27df910 rank 6 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub080:4113205:4113205 [2] NCCL INFO cudaDriverVersion 12010
+gpub080:4113205:4113205 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.180<0>
+gpub080:4113205:4113205 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub080:4113205:4113288 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.180<0>
+gpub080:4113205:4113288 [2] NCCL INFO Using network IB
+gpub080:4113205:4113288 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub080:4113205:4113288 [2] NCCL INFO Trees [0] 55/-1/-1->54->53 [1] 55/-1/-1->54->53
+gpub080:4113205:4113288 [2] NCCL INFO Channel 00/0 : 54[85000] -> 55[c7000] via P2P/IPC
+gpub080:4113205:4113288 [2] NCCL INFO Channel 01/0 : 54[85000] -> 55[c7000] via P2P/IPC
+gpub080:4113205:4113288 [2] NCCL INFO Connected all rings
+gpub080:4113205:4113288 [2] NCCL INFO Channel 00/0 : 54[85000] -> 53[46000] via P2P/IPC
+gpub080:4113205:4113288 [2] NCCL INFO Channel 01/0 : 54[85000] -> 53[46000] via P2P/IPC
+gpub080:4113205:4113288 [2] NCCL INFO Connected all trees
+gpub080:4113205:4113288 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub080:4113205:4113288 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub080:4113205:4113288 [2] NCCL INFO comm 0x50af0e00 rank 54 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub050:1879228:1879228 [3] NCCL INFO cudaDriverVersion 12010
+gpub050:1879228:1879228 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.150<0>
+gpub050:1879228:1879228 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub050:1879228:1879302 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.150<0>
+gpub050:1879228:1879302 [3] NCCL INFO Using network IB
+gpub050:1879228:1879302 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub050:1879228:1879302 [3] NCCL INFO Trees [0] -1/-1/-1->31->30 [1] -1/-1/-1->31->30
+gpub050:1879228:1879302 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [send] via NET/IB/0
+gpub050:1879228:1879302 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [send] via NET/IB/0
+gpub050:1879228:1879302 [3] NCCL INFO Connected all rings
+gpub050:1879228:1879302 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 30[85000] via P2P/IPC
+gpub050:1879228:1879302 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 30[85000] via P2P/IPC
+gpub050:1879228:1879302 [3] NCCL INFO Connected all trees
+gpub050:1879228:1879302 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub050:1879228:1879302 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub050:1879228:1879302 [3] NCCL INFO comm 0x5177dae0 rank 31 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub081:2742230:2742230 [3] NCCL INFO cudaDriverVersion 12010
+gpub081:2742230:2742230 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.181<0>
+gpub081:2742230:2742230 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub081:2742230:2742314 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.181<0>
+gpub081:2742230:2742314 [3] NCCL INFO Using network IB
+gpub081:2742230:2742314 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub081:2742230:2742314 [3] NCCL INFO Trees [0] -1/-1/-1->59->58 [1] -1/-1/-1->59->58
+gpub081:2742230:2742314 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [send] via NET/IB/0
+gpub081:2742230:2742314 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [send] via NET/IB/0
+gpub081:2742230:2742314 [3] NCCL INFO Connected all rings
+gpub081:2742230:2742314 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 58[85000] via P2P/IPC
+gpub081:2742230:2742314 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 58[85000] via P2P/IPC
+gpub081:2742230:2742314 [3] NCCL INFO Connected all trees
+gpub081:2742230:2742314 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub081:2742230:2742314 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub081:2742230:2742314 [3] NCCL INFO comm 0xba992be0 rank 59 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub053:1664488:1664488 [2] NCCL INFO cudaDriverVersion 12010
+gpub053:1664488:1664488 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.153<0>
+gpub053:1664488:1664488 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub053:1664488:1664560 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.153<0>
+gpub053:1664488:1664560 [2] NCCL INFO Using network IB
+gpub053:1664488:1664560 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub053:1664488:1664560 [2] NCCL INFO Trees [0] 43/-1/-1->42->41 [1] 43/-1/-1->42->41
+gpub053:1664488:1664560 [2] NCCL INFO Channel 00/0 : 42[85000] -> 43[c7000] via P2P/IPC
+gpub053:1664488:1664560 [2] NCCL INFO Channel 01/0 : 42[85000] -> 43[c7000] via P2P/IPC
+gpub053:1664488:1664560 [2] NCCL INFO Connected all rings
+gpub053:1664488:1664560 [2] NCCL INFO Channel 00/0 : 42[85000] -> 41[46000] via P2P/IPC
+gpub053:1664488:1664560 [2] NCCL INFO Channel 01/0 : 42[85000] -> 41[46000] via P2P/IPC
+gpub053:1664488:1664560 [2] NCCL INFO Connected all trees
+gpub053:1664488:1664560 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub053:1664488:1664560 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub053:1664488:1664560 [2] NCCL INFO comm 0xe3027a0 rank 42 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub078:4170390:4170390 [1] NCCL INFO cudaDriverVersion 12010
+gpub078:4170390:4170390 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.178<0>
+gpub078:4170390:4170390 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub078:4170390:4170468 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.178<0>
+gpub078:4170390:4170468 [1] NCCL INFO Using network IB
+gpub078:4170390:4170468 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub078:4170390:4170468 [1] NCCL INFO Trees [0] 46/-1/-1->45->44 [1] 46/52/-1->45->44
+gpub078:4170390:4170468 [1] NCCL INFO Channel 00/0 : 45[46000] -> 46[85000] via P2P/IPC
+gpub078:4170390:4170468 [1] NCCL INFO Channel 01/0 : 45[46000] -> 46[85000] via P2P/IPC
+gpub078:4170390:4170468 [1] NCCL INFO Connected all rings
+gpub078:4170390:4170468 [1] NCCL INFO Channel 01/0 : 45[46000] -> 52[7000] [send] via NET/IB/0
+gpub078:4170390:4170468 [1] NCCL INFO Channel 01/0 : 52[7000] -> 45[46000] [receive] via NET/IB/0
+gpub078:4170390:4170468 [1] NCCL INFO Channel 00/0 : 45[46000] -> 44[7000] via P2P/IPC
+gpub078:4170390:4170468 [1] NCCL INFO Channel 01/0 : 45[46000] -> 44[7000] via P2P/IPC
+gpub078:4170390:4170468 [1] NCCL INFO Connected all trees
+gpub078:4170390:4170468 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub078:4170390:4170468 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub078:4170390:4170468 [1] NCCL INFO comm 0x1d97f440 rank 45 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub078:4170389:4170389 [0] NCCL INFO cudaDriverVersion 12010
+gpub078:4170389:4170389 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.178<0>
+gpub078:4170389:4170389 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub078:4170389:4170470 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.178<0>
+gpub078:4170389:4170470 [0] NCCL INFO Using network IB
+gpub078:4170389:4170470 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub078:4170389:4170470 [0] NCCL INFO Trees [0] 45/-1/-1->44->40 [1] 45/36/-1->44->29
+gpub078:4170389:4170470 [0] NCCL INFO Channel 00/0 : 43[c7000] -> 44[7000] [receive] via NET/IB/0
+gpub078:4170389:4170470 [0] NCCL INFO Channel 01/0 : 43[c7000] -> 44[7000] [receive] via NET/IB/0
+gpub078:4170389:4170470 [0] NCCL INFO Channel 00/0 : 44[7000] -> 45[46000] via P2P/IPC
+gpub078:4170389:4170470 [0] NCCL INFO Channel 01/0 : 44[7000] -> 45[46000] via P2P/IPC
+gpub078:4170389:4170470 [0] NCCL INFO Connected all rings
+gpub078:4170389:4170470 [0] NCCL INFO Channel 00/0 : 40[7000] -> 44[7000] [receive] via NET/IB/0
+gpub078:4170389:4170470 [0] NCCL INFO Channel 01/0 : 36[7000] -> 44[7000] [receive] via NET/IB/0
+gpub078:4170389:4170470 [0] NCCL INFO Channel 01/0 : 29[46000] -> 44[7000] [receive] via NET/IB/0
+gpub078:4170389:4170470 [0] NCCL INFO Channel 01/0 : 44[7000] -> 29[46000] [send] via NET/IB/0
+gpub078:4170389:4170470 [0] NCCL INFO Channel 01/0 : 44[7000] -> 36[7000] [send] via NET/IB/0
+gpub078:4170389:4170470 [0] NCCL INFO Channel 00/0 : 44[7000] -> 40[7000] [send] via NET/IB/0
+gpub078:4170389:4170470 [0] NCCL INFO Connected all trees
+gpub078:4170389:4170470 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub078:4170389:4170470 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub078:4170389:4170470 [0] NCCL INFO comm 0x4f656710 rank 44 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub078:4170392:4170392 [3] NCCL INFO cudaDriverVersion 12010
+gpub078:4170392:4170392 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.178<0>
+gpub078:4170392:4170392 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub078:4170392:4170471 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.178<0>
+gpub078:4170392:4170471 [3] NCCL INFO Using network IB
+gpub078:4170392:4170471 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpub078:4170392:4170471 [3] NCCL INFO Trees [0] -1/-1/-1->47->46 [1] -1/-1/-1->47->46
+gpub078:4170392:4170471 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [send] via NET/IB/0
+gpub078:4170392:4170471 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [send] via NET/IB/0
+gpub078:4170392:4170471 [3] NCCL INFO Connected all rings
+gpub078:4170392:4170471 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 46[85000] via P2P/IPC
+gpub078:4170392:4170471 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 46[85000] via P2P/IPC
+gpub078:4170392:4170471 [3] NCCL INFO Connected all trees
+gpub078:4170392:4170471 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub078:4170392:4170471 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub078:4170392:4170471 [3] NCCL INFO comm 0x4f67f390 rank 47 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpub015:879780:879852 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.115<0>
+gpub015:879780:879852 [0] NCCL INFO Using network IB
+gpub015:879780:879852 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub015:879780:879852 [0] NCCL INFO Channel 00/02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+gpub015:879780:879852 [0] NCCL INFO Channel 01/02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+gpub015:879780:879852 [0] NCCL INFO Trees [0] 1/32/-1->0->-1 [1] 1/-1/-1->0->4
+gpub015:879780:879852 [0] NCCL INFO Channel 00/0 : 63[c7000] -> 0[7000] [receive] via NET/IB/0
+gpub015:879780:879852 [0] NCCL INFO Channel 01/0 : 63[c7000] -> 0[7000] [receive] via NET/IB/0
+gpub015:879780:879852 [0] NCCL INFO Channel 00/0 : 0[7000] -> 1[46000] via P2P/IPC
+gpub015:879780:879852 [0] NCCL INFO Channel 01/0 : 0[7000] -> 1[46000] via P2P/IPC
+gpub015:879780:879852 [0] NCCL INFO Connected all rings
+gpub015:879780:879852 [0] NCCL INFO Channel 01/0 : 0[7000] -> 4[7000] [send] via NET/IB/0
+gpub015:879780:879852 [0] NCCL INFO Channel 00/0 : 32[7000] -> 0[7000] [receive] via NET/IB/0
+gpub015:879780:879852 [0] NCCL INFO Channel 00/0 : 0[7000] -> 32[7000] [send] via NET/IB/0
+gpub015:879780:879852 [0] NCCL INFO Channel 01/0 : 4[7000] -> 0[7000] [receive] via NET/IB/0
+gpub015:879780:879852 [0] NCCL INFO Connected all trees
+gpub015:879780:879852 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub015:879780:879852 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub015:879780:879852 [0] NCCL INFO comm 0x51871d20 rank 0 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub015:879781:879781 [1] NCCL INFO cudaDriverVersion 12010
+gpub015:879781:879781 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.115<0>
+gpub015:879781:879781 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub015:879781:879853 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.115<0>
+gpub015:879781:879853 [1] NCCL INFO Using network IB
+gpub015:879781:879853 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub015:879781:879853 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
+gpub015:879781:879853 [1] NCCL INFO Channel 00/0 : 1[46000] -> 2[85000] via P2P/IPC
+gpub015:879781:879853 [1] NCCL INFO Channel 01/0 : 1[46000] -> 2[85000] via P2P/IPC
+gpub015:879781:879853 [1] NCCL INFO Connected all rings
+gpub015:879781:879853 [1] NCCL INFO Channel 00/0 : 1[46000] -> 0[7000] via P2P/IPC
+gpub015:879781:879853 [1] NCCL INFO Channel 01/0 : 1[46000] -> 0[7000] via P2P/IPC
+gpub015:879781:879853 [1] NCCL INFO Connected all trees
+gpub015:879781:879853 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub015:879781:879853 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub015:879781:879853 [1] NCCL INFO comm 0x8d09c1b0 rank 1 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub053:1664486:1664486 [0] NCCL INFO cudaDriverVersion 12010
+gpub053:1664486:1664486 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.153<0>
+gpub053:1664486:1664486 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub053:1664486:1664557 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.153<0>
+gpub053:1664486:1664557 [0] NCCL INFO Using network IB
+gpub053:1664486:1664557 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub053:1664486:1664557 [0] NCCL INFO Trees [0] 41/44/-1->40->49 [1] 41/-1/-1->40->37
+gpub053:1664486:1664557 [0] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [receive] via NET/IB/0
+gpub053:1664486:1664557 [0] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [receive] via NET/IB/0
+gpub053:1664486:1664557 [0] NCCL INFO Channel 00/0 : 40[7000] -> 41[46000] via P2P/IPC
+gpub053:1664486:1664557 [0] NCCL INFO Channel 01/0 : 40[7000] -> 41[46000] via P2P/IPC
+gpub053:1664486:1664557 [0] NCCL INFO Connected all rings
+gpub053:1664486:1664557 [0] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [receive] via NET/IB/0
+gpub053:1664486:1664557 [0] NCCL INFO Channel 00/0 : 40[7000] -> 44[7000] [send] via NET/IB/0
+gpub053:1664486:1664557 [0] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [send] via NET/IB/0
+gpub053:1664486:1664557 [0] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [receive] via NET/IB/0
+gpub053:1664486:1664557 [0] NCCL INFO Channel 00/0 : 44[7000] -> 40[7000] [receive] via NET/IB/0
+gpub053:1664486:1664557 [0] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [send] via NET/IB/0
+gpub053:1664486:1664557 [0] NCCL INFO Connected all trees
+gpub053:1664486:1664557 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub053:1664486:1664557 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub053:1664486:1664557 [0] NCCL INFO comm 0x4f7ecd60 rank 40 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub049:4064876:4064876 [2] NCCL INFO cudaDriverVersion 12010
+gpub049:4064876:4064876 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.149<0>
+gpub049:4064876:4064876 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub049:4064876:4064939 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.149<0>
+gpub049:4064876:4064939 [2] NCCL INFO Using network IB
+gpub049:4064876:4064939 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub049:4064876:4064939 [2] NCCL INFO Trees [0] 27/-1/-1->26->25 [1] 27/-1/-1->26->25
+gpub049:4064876:4064939 [2] NCCL INFO Channel 00/0 : 26[85000] -> 27[c7000] via P2P/IPC
+gpub049:4064876:4064939 [2] NCCL INFO Channel 01/0 : 26[85000] -> 27[c7000] via P2P/IPC
+gpub049:4064876:4064939 [2] NCCL INFO Connected all rings
+gpub049:4064876:4064939 [2] NCCL INFO Channel 00/0 : 26[85000] -> 25[46000] via P2P/IPC
+gpub049:4064876:4064939 [2] NCCL INFO Channel 01/0 : 26[85000] -> 25[46000] via P2P/IPC
+gpub049:4064876:4064939 [2] NCCL INFO Connected all trees
+gpub049:4064876:4064939 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub049:4064876:4064939 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub049:4064876:4064939 [2] NCCL INFO comm 0xb89777d0 rank 26 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub032:3289603:3289603 [0] NCCL INFO cudaDriverVersion 12010
+gpub032:3289603:3289603 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.132<0>
+gpub032:3289603:3289603 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub032:3289603:3289688 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.132<0>
+gpub032:3289603:3289688 [0] NCCL INFO Using network IB
+gpub032:3289603:3289688 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub032:3289603:3289688 [0] NCCL INFO Trees [0] 13/-1/-1->12->8 [1] 13/4/-1->12->28
+gpub032:3289603:3289688 [0] NCCL INFO Channel 00/0 : 11[c7000] -> 12[7000] [receive] via NET/IB/0
+gpub032:3289603:3289688 [0] NCCL INFO Channel 01/0 : 11[c7000] -> 12[7000] [receive] via NET/IB/0
+gpub032:3289603:3289688 [0] NCCL INFO Channel 00/0 : 12[7000] -> 13[46000] via P2P/IPC
+gpub032:3289603:3289688 [0] NCCL INFO Channel 01/0 : 12[7000] -> 13[46000] via P2P/IPC
+gpub032:3289603:3289688 [0] NCCL INFO Connected all rings
+gpub032:3289603:3289688 [0] NCCL INFO Channel 00/0 : 8[7000] -> 12[7000] [receive] via NET/IB/0
+gpub032:3289603:3289688 [0] NCCL INFO Channel 01/0 : 4[7000] -> 12[7000] [receive] via NET/IB/0
+gpub032:3289603:3289688 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [send] via NET/IB/0
+gpub032:3289603:3289688 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [receive] via NET/IB/0
+gpub032:3289603:3289688 [0] NCCL INFO Channel 01/0 : 12[7000] -> 4[7000] [send] via NET/IB/0
+gpub032:3289603:3289688 [0] NCCL INFO Channel 00/0 : 12[7000] -> 8[7000] [send] via NET/IB/0
+gpub032:3289603:3289688 [0] NCCL INFO Connected all trees
+gpub032:3289603:3289688 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub032:3289603:3289688 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub032:3289603:3289688 [0] NCCL INFO comm 0x9f95b40 rank 12 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub081:2742228:2742228 [1] NCCL INFO cudaDriverVersion 12010
+gpub081:2742228:2742228 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.181<0>
+gpub081:2742228:2742228 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub081:2742228:2742316 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.181<0>
+gpub081:2742228:2742316 [1] NCCL INFO Using network IB
+gpub081:2742228:2742316 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub081:2742228:2742316 [1] NCCL INFO Trees [0] 58/52/-1->57->56 [1] 58/-1/-1->57->56
+gpub081:2742228:2742316 [1] NCCL INFO Channel 00/0 : 57[46000] -> 58[85000] via P2P/IPC
+gpub081:2742228:2742316 [1] NCCL INFO Channel 01/0 : 57[46000] -> 58[85000] via P2P/IPC
+gpub081:2742228:2742316 [1] NCCL INFO Connected all rings
+gpub081:2742228:2742316 [1] NCCL INFO Channel 00/0 : 52[7000] -> 57[46000] [receive] via NET/IB/0
+gpub081:2742228:2742316 [1] NCCL INFO Channel 00/0 : 57[46000] -> 52[7000] [send] via NET/IB/0
+gpub081:2742228:2742316 [1] NCCL INFO Channel 00/0 : 57[46000] -> 56[7000] via P2P/IPC
+gpub081:2742228:2742316 [1] NCCL INFO Channel 01/0 : 57[46000] -> 56[7000] via P2P/IPC
+gpub081:2742228:2742316 [1] NCCL INFO Connected all trees
+gpub081:2742228:2742316 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub081:2742228:2742316 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub081:2742228:2742316 [1] NCCL INFO comm 0xb78a1250 rank 57 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub037:1522723:1522723 [1] NCCL INFO cudaDriverVersion 12010
+gpub037:1522723:1522723 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.137<0>
+gpub037:1522723:1522723 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub037:1522723:1522803 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.137<0>
+gpub037:1522723:1522803 [1] NCCL INFO Using network IB
+gpub037:1522723:1522803 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpub037:1522723:1522803 [1] NCCL INFO Trees [0] 22/-1/-1->21->20 [1] 22/24/-1->21->20
+gpub037:1522723:1522803 [1] NCCL INFO Channel 00/0 : 21[46000] -> 22[85000] via P2P/IPC
+gpub037:1522723:1522803 [1] NCCL INFO Channel 01/0 : 21[46000] -> 22[85000] via P2P/IPC
+gpub037:1522723:1522803 [1] NCCL INFO Connected all rings
+gpub037:1522723:1522803 [1] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [send] via NET/IB/0
+gpub037:1522723:1522803 [1] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [receive] via NET/IB/0
+gpub037:1522723:1522803 [1] NCCL INFO Channel 00/0 : 21[46000] -> 20[7000] via P2P/IPC
+gpub037:1522723:1522803 [1] NCCL INFO Channel 01/0 : 21[46000] -> 20[7000] via P2P/IPC
+gpub037:1522723:1522803 [1] NCCL INFO Connected all trees
+gpub037:1522723:1522803 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub037:1522723:1522803 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub037:1522723:1522803 [1] NCCL INFO comm 0xba5d23a0 rank 21 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpub050:1879225:1879225 [0] NCCL INFO cudaDriverVersion 12010
+gpub050:1879225:1879225 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.150<0>
+gpub050:1879225:1879225 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub050:1879225:1879303 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.150<0>
+gpub050:1879225:1879303 [0] NCCL INFO Using network IB
+gpub050:1879225:1879303 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub050:1879225:1879303 [0] NCCL INFO Trees [0] 29/-1/-1->28->24 [1] 29/12/-1->28->60
+gpub050:1879225:1879303 [0] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [receive] via NET/IB/0
+gpub050:1879225:1879303 [0] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [receive] via NET/IB/0
+gpub050:1879225:1879303 [0] NCCL INFO Channel 00/0 : 28[7000] -> 29[46000] via P2P/IPC
+gpub050:1879225:1879303 [0] NCCL INFO Channel 01/0 : 28[7000] -> 29[46000] via P2P/IPC
+gpub050:1879225:1879303 [0] NCCL INFO Connected all rings
+gpub050:1879225:1879303 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [receive] via NET/IB/0
+gpub050:1879225:1879303 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [receive] via NET/IB/0
+gpub050:1879225:1879303 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [receive] via NET/IB/0
+gpub050:1879225:1879303 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [send] via NET/IB/0
+gpub050:1879225:1879303 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [send] via NET/IB/0
+gpub050:1879225:1879303 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [send] via NET/IB/0
+gpub050:1879225:1879303 [0] NCCL INFO Connected all trees
+gpub050:1879225:1879303 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub050:1879225:1879303 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub050:1879225:1879303 [0] NCCL INFO comm 0xa81f9440 rank 28 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpub052:1901669:1901669 [2] NCCL INFO cudaDriverVersion 12010
+gpub052:1901669:1901669 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.152<0>
+gpub052:1901669:1901669 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub052:1901669:1901751 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.152<0>
+gpub052:1901669:1901751 [2] NCCL INFO Using network IB
+gpub052:1901669:1901751 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub052:1901669:1901751 [2] NCCL INFO Trees [0] 39/-1/-1->38->37 [1] 39/-1/-1->38->37
+gpub052:1901669:1901751 [2] NCCL INFO Channel 00/0 : 38[85000] -> 39[c7000] via P2P/IPC
+gpub052:1901669:1901751 [2] NCCL INFO Channel 01/0 : 38[85000] -> 39[c7000] via P2P/IPC
+gpub052:1901669:1901751 [2] NCCL INFO Connected all rings
+gpub052:1901669:1901751 [2] NCCL INFO Channel 00/0 : 38[85000] -> 37[46000] via P2P/IPC
+gpub052:1901669:1901751 [2] NCCL INFO Channel 01/0 : 38[85000] -> 37[46000] via P2P/IPC
+gpub052:1901669:1901751 [2] NCCL INFO Connected all trees
+gpub052:1901669:1901751 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub052:1901669:1901751 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub052:1901669:1901751 [2] NCCL INFO comm 0x50c05250 rank 38 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub037:1522724:1522724 [2] NCCL INFO cudaDriverVersion 12010
+gpub037:1522724:1522724 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.137<0>
+gpub037:1522724:1522724 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub037:1522724:1522800 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.137<0>
+gpub037:1522724:1522800 [2] NCCL INFO Using network IB
+gpub037:1522724:1522800 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpub037:1522724:1522800 [2] NCCL INFO Trees [0] 23/-1/-1->22->21 [1] 23/-1/-1->22->21
+gpub037:1522724:1522800 [2] NCCL INFO Channel 00/0 : 22[85000] -> 23[c7000] via P2P/IPC
+gpub037:1522724:1522800 [2] NCCL INFO Channel 01/0 : 22[85000] -> 23[c7000] via P2P/IPC
+gpub037:1522724:1522800 [2] NCCL INFO Connected all rings
+gpub037:1522724:1522800 [2] NCCL INFO Channel 00/0 : 22[85000] -> 21[46000] via P2P/IPC
+gpub037:1522724:1522800 [2] NCCL INFO Channel 01/0 : 22[85000] -> 21[46000] via P2P/IPC
+gpub037:1522724:1522800 [2] NCCL INFO Connected all trees
+gpub037:1522724:1522800 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub037:1522724:1522800 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub037:1522724:1522800 [2] NCCL INFO comm 0xab8ed350 rank 22 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpub082:1518445:1518445 [0] NCCL INFO cudaDriverVersion 12010
+gpub082:1518445:1518445 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.182<0>
+gpub082:1518445:1518445 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpub082:1518445:1518527 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.182<0>
+gpub082:1518445:1518527 [0] NCCL INFO Using network IB
+gpub082:1518445:1518527 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpub082:1518445:1518527 [0] NCCL INFO Trees [0] 61/-1/-1->60->56 [1] 61/28/-1->60->-1
+gpub082:1518445:1518527 [0] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [receive] via NET/IB/0
+gpub082:1518445:1518527 [0] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [receive] via NET/IB/0
+gpub082:1518445:1518527 [0] NCCL INFO Channel 00/0 : 60[7000] -> 61[46000] via P2P/IPC
+gpub082:1518445:1518527 [0] NCCL INFO Channel 01/0 : 60[7000] -> 61[46000] via P2P/IPC +gpub082:1518445:1518527 [0] NCCL INFO Connected all rings +gpub082:1518445:1518527 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [receive] via NET/IB/0 +gpub082:1518445:1518527 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [receive] via NET/IB/0 +gpub082:1518445:1518527 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [send] via NET/IB/0 +gpub082:1518445:1518527 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [send] via NET/IB/0 +gpub082:1518445:1518527 [0] NCCL INFO Connected all trees +gpub082:1518445:1518527 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub082:1518445:1518527 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub082:1518445:1518527 [0] NCCL INFO comm 0x519aa9d0 rank 60 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub081:2742229:2742229 [2] NCCL INFO cudaDriverVersion 12010 +gpub081:2742229:2742229 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.181<0> +gpub081:2742229:2742229 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub081:2742229:2742315 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.181<0> +gpub081:2742229:2742315 [2] NCCL INFO Using network IB +gpub081:2742229:2742315 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub081:2742229:2742315 [2] NCCL INFO Trees [0] 59/-1/-1->58->57 [1] 59/-1/-1->58->57 +gpub081:2742229:2742315 [2] NCCL INFO Channel 00/0 : 58[85000] -> 59[c7000] via P2P/IPC +gpub081:2742229:2742315 [2] NCCL INFO Channel 01/0 : 58[85000] -> 59[c7000] via P2P/IPC +gpub081:2742229:2742315 [2] NCCL INFO Connected all rings +gpub081:2742229:2742315 [2] NCCL INFO Channel 00/0 : 58[85000] -> 57[46000] via P2P/IPC +gpub081:2742229:2742315 [2] NCCL INFO Channel 01/0 : 58[85000] -> 57[46000] via P2P/IPC +gpub081:2742229:2742315 [2] NCCL INFO Connected all trees +gpub081:2742229:2742315 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub081:2742229:2742315 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub081:2742229:2742315 [2] NCCL INFO comm 0x50f92c00 rank 58 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpub026:2433085:2433085 [1] NCCL INFO cudaDriverVersion 12010 +gpub026:2433085:2433085 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.126<0> +gpub026:2433085:2433085 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub026:2433085:2433165 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.126<0> +gpub026:2433085:2433165 [1] NCCL INFO Using network IB +gpub026:2433085:2433165 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub026:2433085:2433165 [1] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/8/-1->5->4 +gpub026:2433085:2433165 [1] NCCL INFO Channel 00/0 : 5[46000] -> 6[85000] via P2P/IPC +gpub026:2433085:2433165 [1] NCCL INFO Channel 01/0 : 5[46000] -> 6[85000] via P2P/IPC +gpub026:2433085:2433165 [1] NCCL INFO Connected all rings +gpub026:2433085:2433165 [1] NCCL INFO Channel 01/0 : 5[46000] -> 8[7000] [send] via NET/IB/0 +gpub026:2433085:2433165 [1] NCCL INFO Channel 01/0 : 8[7000] -> 5[46000] [receive] via NET/IB/0 +gpub026:2433085:2433165 [1] NCCL INFO Channel 00/0 : 5[46000] -> 4[7000] via P2P/IPC +gpub026:2433085:2433165 [1] NCCL INFO Channel 01/0 : 5[46000] -> 4[7000] via P2P/IPC +gpub026:2433085:2433165 [1] NCCL INFO Connected all trees +gpub026:2433085:2433165 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 
+gpub026:2433085:2433165 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub026:2433085:2433165 [1] NCCL INFO comm 0xb7dab990 rank 5 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub049:4064875:4064875 [1] NCCL INFO cudaDriverVersion 12010 +gpub049:4064875:4064875 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.149<0> +gpub049:4064875:4064875 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub049:4064875:4064940 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.149<0> +gpub049:4064875:4064940 [1] NCCL INFO Using network IB +gpub049:4064875:4064940 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub049:4064875:4064940 [1] NCCL INFO Trees [0] 26/20/-1->25->24 [1] 26/-1/-1->25->24 +gpub049:4064875:4064940 [1] NCCL INFO Channel 00/0 : 25[46000] -> 26[85000] via P2P/IPC +gpub049:4064875:4064940 [1] NCCL INFO Channel 01/0 : 25[46000] -> 26[85000] via P2P/IPC +gpub049:4064875:4064940 [1] NCCL INFO Connected all rings +gpub049:4064875:4064940 [1] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [receive] via NET/IB/0 +gpub049:4064875:4064940 [1] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [send] via NET/IB/0 +gpub049:4064875:4064940 [1] NCCL INFO Channel 00/0 : 25[46000] -> 24[7000] via P2P/IPC +gpub049:4064875:4064940 [1] NCCL INFO Channel 01/0 : 25[46000] -> 24[7000] via P2P/IPC +gpub049:4064875:4064940 [1] NCCL INFO Connected all trees +gpub049:4064875:4064940 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub049:4064875:4064940 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub049:4064875:4064940 [1] NCCL INFO comm 0xa8769be0 rank 25 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpub037:1522722:1522722 [0] NCCL INFO cudaDriverVersion 12010 +gpub037:1522722:1522722 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.137<0> +gpub037:1522722:1522722 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub037:1522722:1522802 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.137<0> +gpub037:1522722:1522802 [0] NCCL INFO Using network IB +gpub037:1522722:1522802 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub037:1522722:1522802 [0] NCCL INFO Trees [0] 21/-1/-1->20->25 [1] 21/16/-1->20->13 +gpub037:1522722:1522802 [0] NCCL INFO Channel 00/0 : 19[c7000] -> 20[7000] [receive] via NET/IB/0 +gpub037:1522722:1522802 [0] NCCL INFO Channel 01/0 : 19[c7000] -> 20[7000] [receive] via NET/IB/0 +gpub037:1522722:1522802 [0] NCCL INFO Channel 00/0 : 20[7000] -> 21[46000] via P2P/IPC +gpub037:1522722:1522802 [0] NCCL INFO Channel 01/0 : 20[7000] -> 21[46000] via P2P/IPC +gpub037:1522722:1522802 [0] NCCL INFO Connected all rings +gpub037:1522722:1522802 [0] NCCL INFO Channel 01/0 : 16[7000] -> 20[7000] [receive] via NET/IB/0 +gpub037:1522722:1522802 [0] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [send] via NET/IB/0 +gpub037:1522722:1522802 [0] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [receive] via NET/IB/0 +gpub037:1522722:1522802 [0] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [send] via NET/IB/0 +gpub037:1522722:1522802 [0] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [receive] via NET/IB/0 +gpub037:1522722:1522802 [0] NCCL INFO Channel 01/0 : 20[7000] -> 16[7000] [send] via NET/IB/0 +gpub037:1522722:1522802 [0] NCCL INFO Connected all trees +gpub037:1522722:1522802 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub037:1522722:1522802 [0] NCCL INFO 2 coll 
channels, 2 p2p channels, 2 p2p channels per peer +gpub037:1522722:1522802 [0] NCCL INFO comm 0x514cae40 rank 20 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpub053:1664489:1664489 [3] NCCL INFO cudaDriverVersion 12010 +gpub053:1664489:1664489 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.153<0> +gpub053:1664489:1664489 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub053:1664489:1664559 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.153<0> +gpub053:1664489:1664559 [3] NCCL INFO Using network IB +gpub053:1664489:1664559 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub053:1664489:1664559 [3] NCCL INFO Trees [0] -1/-1/-1->43->42 [1] -1/-1/-1->43->42 +gpub053:1664489:1664559 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 44[7000] [send] via NET/IB/0 +gpub053:1664489:1664559 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 44[7000] [send] via NET/IB/0 +gpub053:1664489:1664559 [3] NCCL INFO Connected all rings +gpub053:1664489:1664559 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 42[85000] via P2P/IPC +gpub053:1664489:1664559 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 42[85000] via P2P/IPC +gpub053:1664489:1664559 [3] NCCL INFO Connected all trees +gpub053:1664489:1664559 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub053:1664489:1664559 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub053:1664489:1664559 [3] NCCL INFO comm 0xa9e28fe0 rank 43 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub037:1522725:1522725 [3] NCCL INFO cudaDriverVersion 12010 +gpub037:1522725:1522725 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.137<0> +gpub037:1522725:1522725 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub037:1522725:1522801 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.137<0> +gpub037:1522725:1522801 [3] NCCL INFO Using network IB +gpub037:1522725:1522801 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub037:1522725:1522801 [3] NCCL INFO Trees [0] -1/-1/-1->23->22 [1] -1/-1/-1->23->22 +gpub037:1522725:1522801 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [send] via NET/IB/0 +gpub037:1522725:1522801 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [send] via NET/IB/0 +gpub037:1522725:1522801 [3] NCCL INFO Connected all rings +gpub037:1522725:1522801 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 22[85000] via P2P/IPC +gpub037:1522725:1522801 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 22[85000] via P2P/IPC +gpub037:1522725:1522801 [3] NCCL INFO Connected all trees +gpub037:1522725:1522801 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpub037:1522725:1522801 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub037:1522725:1522801 [3] NCCL INFO comm 0x4f7df910 rank 23 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpub036:1870499:1870499 [3] NCCL INFO cudaDriverVersion 12010 +gpub036:1870499:1870499 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.136<0> +gpub036:1870499:1870499 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub036:1870499:1870581 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.136<0> +gpub036:1870499:1870581 [3] NCCL INFO Using network IB +gpub036:1870499:1870581 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub036:1870499:1870581 [3] NCCL INFO Trees [0] -1/-1/-1->19->18 [1] -1/-1/-1->19->18 +gpub036:1870499:1870581 [3] NCCL INFO Channel 00/0 : 19[c7000] -> 20[7000] [send] via NET/IB/0 
+gpub036:1870499:1870581 [3] NCCL INFO Channel 01/0 : 19[c7000] -> 20[7000] [send] via NET/IB/0
+gpub036:1870499:1870581 [3] NCCL INFO Connected all rings
+gpub036:1870499:1870581 [3] NCCL INFO Channel 00/0 : 19[c7000] -> 18[85000] via P2P/IPC
+gpub036:1870499:1870581 [3] NCCL INFO Channel 01/0 : 19[c7000] -> 18[85000] via P2P/IPC
+gpub036:1870499:1870581 [3] NCCL INFO Connected all trees
+gpub036:1870499:1870581 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpub036:1870499:1870581 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpub036:1870499:1870581 [3] NCCL INFO comm 0xa269c50 rank 19 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
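The reducer.cpp warning is emitted once per rank; the trainer here evidently constructs DistributedDataParallel with find_unused_parameters=True, which is what triggers it. Below is a minimal, generic PyTorch sketch of the flag in question, not the ESPnet wrapper itself; it assumes the usual launcher-provided environment variables (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) are set, e.g. by srun or torchrun.

```python
# Generic sketch of the DDP flag the warning above refers to.
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend="nccl")  # env vars set by the launcher
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

model = torch.nn.Linear(1024, 1024).cuda()
ddp_model = DDP(
    model,
    device_ids=[torch.cuda.current_device()],
    # True makes DDP traverse the autograd graph after every forward pass to
    # find parameters that received no gradient; the warning suggests setting
    # this to False when every forward pass uses all parameters.
    find_unused_parameters=True,
)
```

Turning the flag off is only safe if every forward pass really touches all parameters; as the warning itself notes, models with flow control (for example, branches used only on some iterations) can make it a false positive.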
+[gpub015:0/64] 2023-07-04 13:14:42,343 (trainer:732) INFO: 11epoch:train:1-100batch: iter_time=1.167, forward_time=0.250, loss_ctc=83.800, loss_att=63.903, acc=0.671, loss=69.872, backward_time=1.249, grad_norm=95.018, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.181, optim0_lr0=1.178e-04, train_time=5.980
+[gpub015:0/64] 2023-07-04 13:17:19,812 (trainer:732) INFO: 11epoch:train:101-200batch: iter_time=9.388e-05, forward_time=0.141, loss_ctc=70.875, loss_att=53.678, acc=0.690, loss=58.837, backward_time=1.239, grad_norm=82.003, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.178e-04, train_time=3.150
+[gpub015:0/64] 2023-07-04 13:19:56,533 (trainer:732) INFO: 11epoch:train:201-300batch: iter_time=9.683e-05, forward_time=0.140, loss_ctc=73.277, loss_att=62.501, acc=0.661, loss=65.734, backward_time=1.236, grad_norm=87.548, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.177e-04, train_time=3.134
+[gpub015:0/64] 2023-07-04 13:22:35,853 (trainer:732) INFO: 11epoch:train:301-400batch: iter_time=1.014e-04, forward_time=0.141, loss_ctc=79.358, loss_att=59.074, acc=0.690, loss=65.159, backward_time=1.241, grad_norm=91.769, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.176e-04, train_time=3.186
+[gpub015:0/64] 2023-07-04 13:25:20,103 (trainer:732) INFO: 11epoch:train:401-500batch: iter_time=1.025e-04, forward_time=0.140, loss_ctc=72.801, loss_att=54.986, acc=0.664, loss=60.331, backward_time=1.248, grad_norm=99.444, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.176e-04, train_time=3.285
+[gpub015:0/64] 2023-07-04 13:28:00,155 (trainer:732) INFO: 11epoch:train:501-600batch: iter_time=9.845e-05, forward_time=0.140, loss_ctc=72.472, loss_att=58.148, acc=0.641, loss=62.446, backward_time=1.241, grad_norm=85.575, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.175e-04, train_time=3.201
+[gpub015:0/64] 2023-07-04 13:30:47,766 (trainer:732) INFO: 11epoch:train:601-700batch: iter_time=1.032e-04, forward_time=0.142, loss_ctc=74.485, loss_att=62.987, acc=0.667, loss=66.436, backward_time=1.247, grad_norm=81.333, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.174e-04, train_time=3.352
+[gpub015:0/64] 2023-07-04 13:33:37,533 (trainer:732) INFO: 11epoch:train:701-800batch: iter_time=1.023e-04, forward_time=0.140, loss_ctc=81.733, loss_att=62.947, acc=0.691, loss=68.583, backward_time=1.245, grad_norm=94.965, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.174e-04, train_time=3.395
+[gpub015:0/64] 2023-07-04 13:34:38,137 (multiple_iter_factory:32) INFO: Building 1th iter-factory...
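Each (trainer:732) line reports the CTC loss and the attention loss separately, together with the interpolated `loss`. The logged values are consistent with a CTC weight of 0.3, but note that this weight is inferred from the numbers above, not read from the training config. A minimal check:

```python
# Hybrid CTC/attention interpolation, with ctc_weight=0.3 inferred from the
# logged numbers (assumption; not taken from the experiment's YAML config).
def combined_loss(loss_ctc: float, loss_att: float, ctc_weight: float = 0.3) -> float:
    """w * loss_ctc + (1 - w) * loss_att."""
    return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att

# Batches 1-100:    0.3 * 83.800 + 0.7 * 63.903 = 69.872 (matches the log)
assert abs(combined_loss(83.800, 63.903) - 69.872) < 1e-2
# Batches 101-200:  0.3 * 70.875 + 0.7 * 53.678 = 58.837 (matches the log)
assert abs(combined_loss(70.875, 53.678) - 58.837) < 1e-2
```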
+[gpub015:0/64] 2023-07-04 13:34:56,174 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub015:0/64] 2023-07-04 13:34:59,472 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.2", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.2", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.2", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.2", "type": "text"}
+  preprocess: )
+[gpub015:0/64] 2023-07-04 13:34:59,473 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.2,
+[gpub015:0/64] 2023-07-04 13:34:59,479 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129
+[gpub015:0/64] 2023-07-04 13:40:59,440 (trainer:732) INFO: 11epoch:train:801-900batch: iter_time=1.311, forward_time=0.142, loss_ctc=85.490, loss_att=68.439, acc=0.664, loss=73.554, backward_time=1.245, grad_norm=86.487, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.173e-04, train_time=8.838
+[gpub015:0/64] 2023-07-04 13:43:36,689 (trainer:732) INFO: 11epoch:train:901-1000batch: iter_time=1.334e-04, forward_time=0.144, loss_ctc=72.168, loss_att=52.441, acc=0.692, loss=58.359, backward_time=1.237, grad_norm=87.029, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.172e-04, train_time=3.145
+[gpub015:0/64] 2023-07-04 13:46:13,656 (trainer:732) INFO: 11epoch:train:1001-1100batch: iter_time=1.298e-04, forward_time=0.144, loss_ctc=71.855, loss_att=63.794, acc=0.650, loss=66.212, backward_time=1.237, grad_norm=91.081, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.172e-04, train_time=3.139
+[gpub015:0/64] 2023-07-04 13:48:50,728 (trainer:732) INFO: 11epoch:train:1101-1200batch: iter_time=1.288e-04, forward_time=0.145, loss_ctc=78.754, loss_att=62.586, acc=0.672, loss=67.436, backward_time=1.237, grad_norm=94.410, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.171e-04, train_time=3.141
+[gpub015:0/64] 2023-07-04 13:51:27,702 (trainer:732) INFO: 11epoch:train:1201-1300batch: iter_time=1.265e-04, forward_time=0.144, loss_ctc=74.458, loss_att=56.373, acc=0.665, loss=61.798, backward_time=1.236, grad_norm=85.002, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.170e-04, train_time=3.139
+[gpub015:0/64] 2023-07-04 13:54:04,338 (trainer:732) INFO: 11epoch:train:1301-1400batch: iter_time=1.328e-04, forward_time=0.144, loss_ctc=74.356, loss_att=61.107, acc=0.633, loss=65.082, backward_time=1.236, grad_norm=96.927, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.170e-04, train_time=3.133
+[gpub015:0/64] 2023-07-04 13:56:40,929 (trainer:732) INFO: 11epoch:train:1401-1500batch: iter_time=1.283e-04, forward_time=0.144, loss_ctc=70.043, loss_att=58.437, acc=0.645, loss=61.919, backward_time=1.235, grad_norm=79.036, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.169e-04, train_time=3.132
+[gpub015:0/64] 2023-07-04 13:59:17,863 (trainer:732) INFO: 11epoch:train:1501-1600batch: iter_time=1.270e-04, forward_time=0.145, loss_ctc=75.925, loss_att=57.323, acc=0.695, loss=62.903, backward_time=1.237, grad_norm=80.385, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.168e-04, train_time=3.138
+[gpub015:0/64] 2023-07-04 14:01:03,303 (multiple_iter_factory:32) INFO: Building 2th iter-factory...
+[gpub015:0/64] 2023-07-04 14:01:21,149 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub015:0/64] 2023-07-04 14:01:24,530 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.4", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.4", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.4", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.4", "type": "text"}
+  preprocess: )
+[gpub015:0/64] 2023-07-04 14:01:24,530 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.4,
+[gpub015:0/64] 2023-07-04 14:01:24,537 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129
+[gpub015:0/64] 2023-07-04 14:05:06,649 (trainer:732) INFO: 11epoch:train:1601-1700batch: iter_time=1.217, forward_time=0.143, loss_ctc=84.818, loss_att=70.745, acc=0.672, loss=74.967, backward_time=1.246, grad_norm=97.726, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.178, optim0_lr0=1.168e-04, train_time=6.976
+[gpub015:0/64] 2023-07-04 14:07:44,458 (trainer:732) INFO: 11epoch:train:1701-1800batch: iter_time=1.115e-04, forward_time=0.144, loss_ctc=71.343, loss_att=51.236, acc=0.688, loss=57.268, backward_time=1.239, grad_norm=103.277, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.179, optim0_lr0=1.167e-04, train_time=3.156
+[gpub015:0/64] 2023-07-04 14:10:21,624 (trainer:732) INFO: 11epoch:train:1801-1900batch: iter_time=1.042e-04, forward_time=0.143, loss_ctc=73.993, loss_att=60.990, acc=0.670, loss=64.891, backward_time=1.239, grad_norm=98.902, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.179, optim0_lr0=1.167e-04, train_time=3.143
+[gpub015:0/64] 2023-07-04 14:12:58,635 (trainer:732) INFO: 11epoch:train:1901-2000batch: iter_time=1.055e-04, forward_time=0.143, loss_ctc=77.280, loss_att=60.422, acc=0.672, loss=65.479, backward_time=1.239, grad_norm=99.361, clip=100.000, loss_scale=2.749e+11, optim_step_time=0.179, optim0_lr0=1.166e-04, train_time=3.140
+[gpub015:0/64] 2023-07-04 14:15:35,403 (trainer:732) INFO: 11epoch:train:2001-2100batch: iter_time=1.027e-04, forward_time=0.144, loss_ctc=71.434, loss_att=55.009, acc=0.670, loss=59.936, backward_time=1.236, grad_norm=89.022, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.165e-04, train_time=3.135
+[gpub015:0/64] 2023-07-04 14:18:12,306 (trainer:732) INFO: 11epoch:train:2101-2200batch: iter_time=9.694e-05, forward_time=0.143, loss_ctc=80.484, loss_att=62.826, acc=0.656, loss=68.123, backward_time=1.238, grad_norm=91.242, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.165e-04, train_time=3.138
+[gpub015:0/64] 2023-07-04 14:20:48,909 (trainer:732) INFO: 11epoch:train:2201-2300batch: iter_time=1.006e-04, forward_time=0.142, loss_ctc=66.251, loss_att=54.235, acc=0.643, loss=57.840, backward_time=1.236, grad_norm=75.648, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.164e-04, train_time=3.132
+[gpub015:0/64] 2023-07-04 14:23:25,954 (trainer:732) INFO: 11epoch:train:2301-2400batch: iter_time=9.957e-05, forward_time=0.143, loss_ctc=76.969, loss_att=61.945, acc=0.675, loss=66.452, backward_time=1.239, grad_norm=86.558, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.163e-04, train_time=3.141
+[gpub015:0/64] 2023-07-04 14:26:02,381 (multiple_iter_factory:32) INFO: Building 3th iter-factory...
+[gpub015:0/64] 2023-07-04 14:26:20,628 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub015:0/64] 2023-07-04 14:26:24,038 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.0", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.0", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.0", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.0", "type": "text"}
+  preprocess: )
+[gpub015:0/64] 2023-07-04 14:26:24,038 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.0,
+[gpub015:0/64] 2023-07-04 14:26:24,044 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129
+[gpub015:0/64] 2023-07-04 14:29:27,606 (trainer:732) INFO: 11epoch:train:2401-2500batch: iter_time=1.234, forward_time=0.142, loss_ctc=83.156, loss_att=65.525, acc=0.675, loss=70.814, backward_time=1.245, grad_norm=87.609, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.163e-04, train_time=7.233
+[gpub015:0/64] 2023-07-04 14:32:05,977 (trainer:732) INFO: 11epoch:train:2501-2600batch: iter_time=9.809e-05, forward_time=0.143, loss_ctc=78.756, loss_att=58.058, acc=0.689, loss=64.267, backward_time=1.245, grad_norm=89.189, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.162e-04, train_time=3.167
+[gpub015:0/64] 2023-07-04 14:34:43,361 (trainer:732) INFO: 11epoch:train:2601-2700batch: iter_time=8.955e-05, forward_time=0.141, loss_ctc=73.761, loss_att=55.772, acc=0.686, loss=61.169, backward_time=1.240, grad_norm=84.815, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.162e-04, train_time=3.147
+[gpub015:0/64] 2023-07-04 14:37:20,015 (trainer:732) INFO: 11epoch:train:2701-2800batch: iter_time=1.035e-04, forward_time=0.142, loss_ctc=73.778, loss_att=62.326, acc=0.658, loss=65.762, backward_time=1.236, grad_norm=88.821, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.161e-04, train_time=3.133
+[gpub015:0/64] 2023-07-04 14:39:56,726 (trainer:732) INFO: 11epoch:train:2801-2900batch: iter_time=1.067e-04, forward_time=0.142, loss_ctc=77.384, loss_att=58.538, acc=0.690, loss=64.192, backward_time=1.237, grad_norm=82.802, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.160e-04, train_time=3.134
+[gpub015:0/64] 2023-07-04 14:42:33,340 (trainer:732) INFO: 11epoch:train:2901-3000batch: iter_time=1.122e-04, forward_time=0.142, loss_ctc=70.876, loss_att=53.471, acc=0.667, loss=58.692, backward_time=1.236, grad_norm=78.599, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.160e-04, train_time=3.132
+[gpub015:0/64] 2023-07-04 14:45:10,899 (trainer:732) INFO: 11epoch:train:3001-3100batch: iter_time=1.203e-04, forward_time=0.142, loss_ctc=71.915, loss_att=58.489, acc=0.639, loss=62.517, backward_time=1.238, grad_norm=79.638, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.159e-04, train_time=3.151
+[gpub015:0/64] 2023-07-04 14:47:55,543 (trainer:732) INFO: 11epoch:train:3101-3200batch: iter_time=1.053e-04, forward_time=0.143, loss_ctc=73.017, loss_att=63.582, acc=0.657, loss=66.413, backward_time=1.249, grad_norm=104.209, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.158e-04, train_time=3.293
+[gpub015:0/64] 2023-07-04 14:50:39,507 (trainer:732) INFO: 11epoch:train:3201-3300batch: iter_time=1.197e-04, forward_time=0.143, loss_ctc=79.855, loss_att=60.339, acc=0.696, loss=66.194, backward_time=1.242, grad_norm=90.048, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.158e-04, train_time=3.279
+[gpub015:0/64] 2023-07-04 14:51:31,262 (multiple_iter_factory:32) INFO: Building 4th iter-factory...
+[gpub015:0/64] 2023-07-04 14:51:49,250 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpub015:0/64] 2023-07-04 14:51:52,772 (abs_task:1570) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.8", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.8", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.8", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.8", "type": "text"}
+  preprocess: )
+[gpub015:0/64] 2023-07-04 14:51:52,773 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.8,
+[gpub015:0/64] 2023-07-04 14:51:52,779 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129
+[gpub015:0/64] 2023-07-04 14:57:01,433 (trainer:732) INFO: 11epoch:train:3301-3400batch: iter_time=1.215, forward_time=0.143, loss_ctc=83.729, loss_att=64.933, acc=0.671, loss=70.572, backward_time=1.246, grad_norm=93.661, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.157e-04, train_time=7.638
+[gpub015:0/64] 2023-07-04 14:59:38,410 (trainer:732) INFO: 11epoch:train:3401-3500batch: iter_time=1.059e-04, forward_time=0.142, loss_ctc=72.534, loss_att=53.184, acc=0.696, loss=58.989, backward_time=1.236, grad_norm=82.069, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.157e-04, train_time=3.139
+[gpub015:0/64] 2023-07-04 15:02:15,990 (trainer:732) INFO: 11epoch:train:3501-3600batch: iter_time=1.292e-04, forward_time=0.142, loss_ctc=72.125, loss_att=60.773, acc=0.657, loss=64.179, backward_time=1.242, grad_norm=79.186, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.156e-04, train_time=3.151
+[gpub015:0/64] 2023-07-04 15:04:52,996 (trainer:732) INFO: 11epoch:train:3601-3700batch: iter_time=1.228e-04, forward_time=0.144, loss_ctc=76.260, loss_att=59.534, acc=0.681, loss=64.551, backward_time=1.239, grad_norm=93.112, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.155e-04, train_time=3.140
+[gpub015:0/64] 2023-07-04 15:07:29,645 (trainer:732) INFO: 11epoch:train:3701-3800batch: iter_time=1.134e-04, forward_time=0.143, loss_ctc=75.428, loss_att=54.548, acc=0.673, loss=60.812, backward_time=1.236, grad_norm=79.497, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.155e-04, train_time=3.133
+[gpub015:0/64] 2023-07-04 15:10:06,182 (trainer:732) INFO: 11epoch:train:3801-3900batch: iter_time=1.151e-04, forward_time=0.142, loss_ctc=71.937, loss_att=59.873, acc=0.630, loss=63.492, backward_time=1.235, grad_norm=89.457, clip=100.000, loss_scale=5.498e+11,
optim_step_time=0.178, optim0_lr0=1.154e-04, train_time=3.131 +[gpub015:0/64] 2023-07-04 15:12:43,022 (trainer:732) INFO: 11epoch:train:3901-4000batch: iter_time=1.081e-04, forward_time=0.143, loss_ctc=71.573, loss_att=63.440, acc=0.652, loss=65.880, backward_time=1.237, grad_norm=79.058, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.153e-04, train_time=3.137 +[gpub015:0/64] 2023-07-04 15:15:19,772 (trainer:732) INFO: 11epoch:train:4001-4100batch: iter_time=1.220e-04, forward_time=0.143, loss_ctc=74.521, loss_att=56.145, acc=0.691, loss=61.658, backward_time=1.237, grad_norm=80.871, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.179, optim0_lr0=1.153e-04, train_time=3.135 +[gpub015:0/64] 2023-07-04 15:17:04,683 (multiple_iter_factory:32) INFO: Building 5th iter-factory... +[gpub015:0/64] 2023-07-04 15:17:22,602 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 15:17:26,004 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.9", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 15:17:26,005 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.9, +[gpub015:0/64] 2023-07-04 15:17:26,011 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 15:21:23,202 (trainer:732) INFO: 11epoch:train:4101-4200batch: iter_time=1.199, forward_time=0.143, loss_ctc=81.712, loss_att=65.217, acc=0.675, loss=70.165, backward_time=1.247, grad_norm=87.365, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.179, optim0_lr0=1.152e-04, train_time=7.268 +[gpub015:0/64] 2023-07-04 15:24:00,969 (trainer:732) INFO: 11epoch:train:4201-4300batch: iter_time=1.045e-04, forward_time=0.144, loss_ctc=72.537, loss_att=51.252, acc=0.700, loss=57.638, backward_time=1.240, grad_norm=85.262, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.152e-04, train_time=3.155 +[gpub015:0/64] 2023-07-04 15:26:37,814 (trainer:732) INFO: 11epoch:train:4301-4400batch: iter_time=1.208e-04, forward_time=0.142, loss_ctc=71.997, loss_att=59.331, acc=0.679, loss=63.131, backward_time=1.238, grad_norm=103.945, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.151e-04, train_time=3.137 +[gpub015:0/64] 2023-07-04 15:29:14,481 (trainer:732) INFO: 11epoch:train:4401-4500batch: iter_time=1.185e-04, forward_time=0.143, loss_ctc=78.545, loss_att=61.794, acc=0.684, loss=66.819, backward_time=1.237, grad_norm=96.020, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.150e-04, train_time=3.133 +[gpub015:0/64] 2023-07-04 15:31:51,324 (trainer:732) INFO: 11epoch:train:4501-4600batch: iter_time=1.079e-04, forward_time=0.144, loss_ctc=72.669, loss_att=54.380, acc=0.683, loss=59.867, backward_time=1.236, grad_norm=94.169, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.150e-04, train_time=3.137 +[gpub015:0/64] 2023-07-04 15:34:28,101 (trainer:732) INFO: 11epoch:train:4601-4700batch: iter_time=1.150e-04, forward_time=0.142, 
loss_ctc=78.958, loss_att=61.475, acc=0.651, loss=66.720, backward_time=1.237, grad_norm=82.362, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.149e-04, train_time=3.135 +[gpub015:0/64] 2023-07-04 15:37:05,127 (trainer:732) INFO: 11epoch:train:4701-4800batch: iter_time=1.123e-04, forward_time=0.144, loss_ctc=67.006, loss_att=56.681, acc=0.658, loss=59.779, backward_time=1.237, grad_norm=76.677, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.149e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 15:39:42,072 (trainer:732) INFO: 11epoch:train:4801-4900batch: iter_time=1.144e-04, forward_time=0.142, loss_ctc=76.017, loss_att=60.305, acc=0.690, loss=65.018, backward_time=1.238, grad_norm=93.325, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.148e-04, train_time=3.139 +[gpub015:0/64] 2023-07-04 15:42:18,923 (trainer:732) INFO: 11epoch:train:4901-5000batch: iter_time=1.163e-04, forward_time=0.143, loss_ctc=80.694, loss_att=63.853, acc=0.683, loss=68.905, backward_time=1.239, grad_norm=94.593, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.147e-04, train_time=3.137 +[gpub015:0/64] 2023-07-04 15:42:20,304 (multiple_iter_factory:32) INFO: Building 6th iter-factory... +[gpub015:0/64] 2023-07-04 15:42:38,424 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 15:42:41,782 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.10", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.10", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.10", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.10", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 15:42:41,782 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.10, +[gpub015:0/64] 2023-07-04 15:42:41,788 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 15:48:19,632 (trainer:732) INFO: 11epoch:train:5001-5100batch: iter_time=1.237, forward_time=0.144, loss_ctc=78.222, loss_att=59.300, acc=0.679, loss=64.976, backward_time=1.247, grad_norm=95.238, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.147e-04, train_time=7.214 +[gpub015:0/64] 2023-07-04 15:50:56,912 (trainer:732) INFO: 11epoch:train:5101-5200batch: iter_time=1.348e-04, forward_time=0.142, loss_ctc=70.037, loss_att=54.648, acc=0.687, loss=59.265, backward_time=1.236, grad_norm=85.950, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.146e-04, train_time=3.145 +[gpub015:0/64] 2023-07-04 15:53:33,723 (trainer:732) INFO: 11epoch:train:5201-5300batch: iter_time=1.410e-04, forward_time=0.143, loss_ctc=74.060, loss_att=61.954, acc=0.662, loss=65.585, backward_time=1.236, grad_norm=92.304, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.146e-04, train_time=3.136 +[gpub015:0/64] 2023-07-04 15:56:10,431 (trainer:732) INFO: 11epoch:train:5301-5400batch: iter_time=1.411e-04, forward_time=0.144, loss_ctc=74.817, loss_att=56.868, acc=0.686, loss=62.253, backward_time=1.235, grad_norm=79.959, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.145e-04, train_time=3.134 
+[gpub015:0/64] 2023-07-04 15:58:47,203 (trainer:732) INFO: 11epoch:train:5401-5500batch: iter_time=1.433e-04, forward_time=0.144, loss_ctc=73.246, loss_att=57.581, acc=0.663, loss=62.281, backward_time=1.236, grad_norm=89.297, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.144e-04, train_time=3.135 +[gpub015:0/64] 2023-07-04 16:01:23,856 (trainer:732) INFO: 11epoch:train:5501-5600batch: iter_time=1.234e-04, forward_time=0.143, loss_ctc=69.266, loss_att=52.780, acc=0.658, loss=57.726, backward_time=1.235, grad_norm=84.838, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.144e-04, train_time=3.133 +[gpub015:0/64] 2023-07-04 16:04:00,841 (trainer:732) INFO: 11epoch:train:5601-5700batch: iter_time=1.416e-04, forward_time=0.144, loss_ctc=70.474, loss_att=61.191, acc=0.661, loss=63.976, backward_time=1.238, grad_norm=85.157, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.143e-04, train_time=3.139 +[gpub015:0/64] 2023-07-04 16:06:37,726 (trainer:732) INFO: 11epoch:train:5701-5800batch: iter_time=1.304e-04, forward_time=0.143, loss_ctc=83.723, loss_att=67.490, acc=0.687, loss=72.360, backward_time=1.237, grad_norm=82.489, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.143e-04, train_time=3.137 +[gpub015:0/64] 2023-07-04 16:07:32,415 (multiple_iter_factory:32) INFO: Building 7th iter-factory... +[gpub015:0/64] 2023-07-04 16:07:50,213 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 16:07:53,634 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.3", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.3", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.3", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.3", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 16:07:53,634 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.3, +[gpub015:0/64] 2023-07-04 16:07:53,641 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 16:12:26,124 (trainer:732) INFO: 11epoch:train:5801-5900batch: iter_time=1.216, forward_time=0.142, loss_ctc=78.153, loss_att=58.359, acc=0.683, loss=64.297, backward_time=1.245, grad_norm=81.488, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.142e-04, train_time=6.968 +[gpub015:0/64] 2023-07-04 16:15:03,120 (trainer:732) INFO: 11epoch:train:5901-6000batch: iter_time=1.015e-04, forward_time=0.141, loss_ctc=75.076, loss_att=52.764, acc=0.698, loss=59.457, backward_time=1.236, grad_norm=87.247, clip=100.000, loss_scale=5.498e+11, optim_step_time=0.178, optim0_lr0=1.141e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 16:17:40,160 (trainer:732) INFO: 11epoch:train:6001-6100batch: iter_time=1.041e-04, forward_time=0.142, loss_ctc=72.470, loss_att=62.178, acc=0.673, loss=65.265, backward_time=1.237, grad_norm=83.009, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.179, optim0_lr0=1.141e-04, train_time=3.141 +[gpub015:0/64] 2023-07-04 16:20:17,104 (trainer:732) INFO: 11epoch:train:6101-6200batch: iter_time=1.004e-04, forward_time=0.144, loss_ctc=74.977, loss_att=58.241, acc=0.691, loss=63.262, backward_time=1.238, 
grad_norm=88.495, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.140e-04, train_time=3.139 +[gpub015:0/64] 2023-07-04 16:22:54,294 (trainer:732) INFO: 11epoch:train:6201-6300batch: iter_time=1.117e-04, forward_time=0.142, loss_ctc=73.711, loss_att=55.176, acc=0.683, loss=60.737, backward_time=1.237, grad_norm=84.983, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.140e-04, train_time=3.144 +[gpub015:0/64] 2023-07-04 16:25:30,906 (trainer:732) INFO: 11epoch:train:6301-6400batch: iter_time=1.066e-04, forward_time=0.142, loss_ctc=75.193, loss_att=58.962, acc=0.652, loss=63.832, backward_time=1.237, grad_norm=87.664, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.179, optim0_lr0=1.139e-04, train_time=3.132 +[gpub015:0/64] 2023-07-04 16:28:07,562 (trainer:732) INFO: 11epoch:train:6401-6500batch: iter_time=1.093e-04, forward_time=0.142, loss_ctc=67.375, loss_att=55.362, acc=0.673, loss=58.966, backward_time=1.236, grad_norm=87.593, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.179, optim0_lr0=1.138e-04, train_time=3.133 +[gpub015:0/64] 2023-07-04 16:30:44,706 (trainer:732) INFO: 11epoch:train:6501-6600batch: iter_time=1.079e-04, forward_time=0.144, loss_ctc=74.061, loss_att=54.804, acc=0.709, loss=60.581, backward_time=1.238, grad_norm=79.167, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.138e-04, train_time=3.143 +[gpub015:0/64] 2023-07-04 16:32:33,226 (multiple_iter_factory:32) INFO: Building 8th iter-factory... +[gpub015:0/64] 2023-07-04 16:32:51,116 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 16:32:54,503 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.6", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.6", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.6", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.6", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 16:32:54,503 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.6, +[gpub015:0/64] 2023-07-04 16:32:54,509 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 16:36:49,987 (trainer:732) INFO: 11epoch:train:6601-6700batch: iter_time=1.193, forward_time=0.144, loss_ctc=83.775, loss_att=70.165, acc=0.684, loss=74.248, backward_time=1.261, grad_norm=89.442, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.179, optim0_lr0=1.137e-04, train_time=7.305 +[gpub015:0/64] 2023-07-04 16:39:27,770 (trainer:732) INFO: 11epoch:train:6701-6800batch: iter_time=1.102e-04, forward_time=0.142, loss_ctc=72.289, loss_att=51.250, acc=0.688, loss=57.562, backward_time=1.238, grad_norm=89.186, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.137e-04, train_time=3.155 +[gpub015:0/64] 2023-07-04 16:42:09,188 (trainer:732) INFO: 11epoch:train:6801-6900batch: iter_time=1.029e-04, forward_time=0.143, loss_ctc=73.752, loss_att=61.099, acc=0.672, loss=64.895, backward_time=1.240, grad_norm=83.648, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.136e-04, train_time=3.228 +[gpub015:0/64] 2023-07-04 16:44:51,998 (trainer:732) INFO: 11epoch:train:6901-7000batch: 
iter_time=1.079e-04, forward_time=0.150, loss_ctc=74.119, loss_att=58.766, acc=0.679, loss=63.372, backward_time=1.244, grad_norm=117.258, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.177, optim0_lr0=1.135e-04, train_time=3.256 +[gpub015:0/64] 2023-07-04 16:47:37,040 (trainer:732) INFO: 11epoch:train:7001-7100batch: iter_time=1.079e-04, forward_time=0.144, loss_ctc=70.548, loss_att=54.290, acc=0.676, loss=59.167, backward_time=1.239, grad_norm=82.573, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.177, optim0_lr0=1.135e-04, train_time=3.301 +[gpub015:0/64] 2023-07-04 16:50:15,687 (trainer:732) INFO: 11epoch:train:7101-7200batch: iter_time=1.151e-04, forward_time=0.146, loss_ctc=77.779, loss_att=60.926, acc=0.663, loss=65.982, backward_time=1.237, grad_norm=82.395, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.177, optim0_lr0=1.134e-04, train_time=3.173 +[gpub015:0/64] 2023-07-04 16:52:52,296 (trainer:732) INFO: 11epoch:train:7201-7300batch: iter_time=1.123e-04, forward_time=0.141, loss_ctc=68.907, loss_att=54.263, acc=0.649, loss=58.656, backward_time=1.235, grad_norm=81.090, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.134e-04, train_time=3.132 +[gpub015:0/64] 2023-07-04 16:55:29,304 (trainer:732) INFO: 11epoch:train:7301-7400batch: iter_time=1.139e-04, forward_time=0.143, loss_ctc=73.895, loss_att=61.349, acc=0.678, loss=65.113, backward_time=1.238, grad_norm=86.829, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.133e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 16:58:06,207 (trainer:732) INFO: 11epoch:train:7401-7500batch: iter_time=1.001e-04, forward_time=0.142, loss_ctc=80.322, loss_att=63.802, acc=0.685, loss=68.758, backward_time=1.236, grad_norm=94.311, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.133e-04, train_time=3.138 +[gpub015:0/64] 2023-07-04 16:58:11,666 (multiple_iter_factory:32) INFO: Building 9th iter-factory... 
+[gpub015:0/64] 2023-07-04 16:58:29,313 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 16:58:32,661 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.11", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.11", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.11", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.11", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 16:58:32,661 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.11, +[gpub015:0/64] 2023-07-04 16:58:32,667 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 17:05:39,174 (trainer:732) INFO: 11epoch:train:7501-7600batch: iter_time=1.528, forward_time=0.144, loss_ctc=78.582, loss_att=59.317, acc=0.684, loss=65.097, backward_time=1.252, grad_norm=88.598, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.132e-04, train_time=9.059 +[gpub015:0/64] 2023-07-04 17:08:17,107 (trainer:732) INFO: 11epoch:train:7601-7700batch: iter_time=1.000e-04, forward_time=0.143, loss_ctc=69.421, loss_att=54.039, acc=0.703, loss=58.653, backward_time=1.241, grad_norm=91.904, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.131e-04, train_time=3.158 +[gpub015:0/64] 2023-07-04 17:10:54,037 (trainer:732) INFO: 11epoch:train:7701-7800batch: iter_time=1.044e-04, forward_time=0.142, loss_ctc=72.978, loss_att=61.569, acc=0.673, loss=64.992, backward_time=1.239, grad_norm=94.885, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.131e-04, train_time=3.138 +[gpub015:0/64] 2023-07-04 17:13:31,101 (trainer:732) INFO: 11epoch:train:7801-7900batch: iter_time=9.732e-05, forward_time=0.143, loss_ctc=75.174, loss_att=54.680, acc=0.694, loss=60.828, backward_time=1.239, grad_norm=80.141, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.130e-04, train_time=3.141 +[gpub015:0/64] 2023-07-04 17:16:07,930 (trainer:732) INFO: 11epoch:train:7901-8000batch: iter_time=1.003e-04, forward_time=0.143, loss_ctc=72.983, loss_att=56.893, acc=0.676, loss=61.720, backward_time=1.238, grad_norm=83.576, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.130e-04, train_time=3.136 +[gpub015:0/64] 2023-07-04 17:18:44,592 (trainer:732) INFO: 11epoch:train:8001-8100batch: iter_time=1.019e-04, forward_time=0.143, loss_ctc=71.290, loss_att=54.002, acc=0.666, loss=59.188, backward_time=1.237, grad_norm=104.543, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.129e-04, train_time=3.133 +[gpub015:0/64] 2023-07-04 17:21:21,553 (trainer:732) INFO: 11epoch:train:8101-8200batch: iter_time=1.072e-04, forward_time=0.143, loss_ctc=71.502, loss_att=61.136, acc=0.676, loss=64.245, backward_time=1.238, grad_norm=79.194, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.129e-04, train_time=3.139 +[gpub015:0/64] 2023-07-04 17:23:58,799 (trainer:732) INFO: 11epoch:train:8201-8300batch: iter_time=9.818e-05, forward_time=0.144, loss_ctc=81.675, loss_att=65.649, acc=0.699, loss=70.457, backward_time=1.241, grad_norm=95.853, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, 
optim0_lr0=1.128e-04, train_time=3.145 +[gpub015:0/64] 2023-07-04 17:24:53,534 (multiple_iter_factory:32) INFO: Building 10th iter-factory... +[gpub015:0/64] 2023-07-04 17:25:11,166 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 17:25:14,530 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.7", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.7", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.7", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.7", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 17:25:14,530 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.7, +[gpub015:0/64] 2023-07-04 17:25:14,536 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 17:29:58,326 (trainer:732) INFO: 11epoch:train:8301-8400batch: iter_time=1.239, forward_time=0.142, loss_ctc=77.508, loss_att=57.390, acc=0.690, loss=63.426, backward_time=1.251, grad_norm=84.822, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.179, optim0_lr0=1.127e-04, train_time=7.190 +[gpub015:0/64] 2023-07-04 17:32:39,866 (trainer:732) INFO: 11epoch:train:8401-8500batch: iter_time=1.145e-04, forward_time=0.143, loss_ctc=71.150, loss_att=51.139, acc=0.704, loss=57.143, backward_time=1.244, grad_norm=74.997, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.127e-04, train_time=3.231 +[gpub015:0/64] 2023-07-04 17:35:18,227 (trainer:732) INFO: 11epoch:train:8501-8600batch: iter_time=1.087e-04, forward_time=0.142, loss_ctc=70.281, loss_att=61.581, acc=0.677, loss=64.191, backward_time=1.238, grad_norm=88.779, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.126e-04, train_time=3.167 +[gpub015:0/64] 2023-07-04 17:38:03,061 (trainer:732) INFO: 11epoch:train:8601-8700batch: iter_time=0.002, forward_time=0.166, loss_ctc=74.328, loss_att=57.549, acc=0.693, loss=62.583, backward_time=1.255, grad_norm=120.714, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.126e-04, train_time=3.296 +[gpub015:0/64] 2023-07-04 17:40:40,597 (trainer:732) INFO: 11epoch:train:8701-8800batch: iter_time=1.146e-04, forward_time=0.144, loss_ctc=72.996, loss_att=53.756, acc=0.687, loss=59.528, backward_time=1.239, grad_norm=77.639, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.125e-04, train_time=3.150 +[gpub015:0/64] 2023-07-04 17:43:18,170 (trainer:732) INFO: 11epoch:train:8801-8900batch: iter_time=1.120e-04, forward_time=0.145, loss_ctc=72.990, loss_att=58.467, acc=0.657, loss=62.824, backward_time=1.238, grad_norm=79.134, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.183, optim0_lr0=1.125e-04, train_time=3.151 +[gpub015:0/64] 2023-07-04 17:46:06,875 (trainer:732) INFO: 11epoch:train:8901-9000batch: iter_time=1.090e-04, forward_time=0.179, loss_ctc=67.241, loss_att=54.706, acc=0.676, loss=58.467, backward_time=1.250, grad_norm=75.286, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.181, optim0_lr0=1.124e-04, train_time=3.374 +[gpub015:0/64] 2023-07-04 17:48:51,294 (trainer:732) INFO: 11epoch:train:9001-9100batch: iter_time=1.047e-04, forward_time=0.143, loss_ctc=75.541, loss_att=55.910, 
acc=0.706, loss=61.799, backward_time=1.246, grad_norm=79.268, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.179, optim0_lr0=1.123e-04, train_time=3.288 +[gpub015:0/64] 2023-07-04 17:50:45,067 (multiple_iter_factory:32) INFO: Building 11th iter-factory... +[gpub015:0/64] 2023-07-04 17:51:02,822 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 17:51:06,236 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.5", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 17:51:06,236 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.5, +[gpub015:0/64] 2023-07-04 17:51:06,242 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 17:55:34,613 (trainer:732) INFO: 11epoch:train:9101-9200batch: iter_time=1.493, forward_time=0.145, loss_ctc=84.206, loss_att=70.331, acc=0.687, loss=74.493, backward_time=1.252, grad_norm=92.347, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.179, optim0_lr0=1.123e-04, train_time=8.066 +[gpub015:0/64] 2023-07-04 17:58:14,578 (trainer:732) INFO: 11epoch:train:9201-9300batch: iter_time=1.225e-04, forward_time=0.144, loss_ctc=72.015, loss_att=51.510, acc=0.694, loss=57.662, backward_time=1.240, grad_norm=101.263, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.179, optim0_lr0=1.122e-04, train_time=3.199 +[gpub015:0/64] 2023-07-04 18:00:53,333 (trainer:732) INFO: 11epoch:train:9301-9400batch: iter_time=1.103e-04, forward_time=0.148, loss_ctc=71.528, loss_att=60.448, acc=0.694, loss=63.772, backward_time=1.241, grad_norm=82.338, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.179, optim0_lr0=1.122e-04, train_time=3.175 +[gpub015:0/64] 2023-07-04 18:03:34,243 (trainer:732) INFO: 11epoch:train:9401-9500batch: iter_time=1.096e-04, forward_time=0.152, loss_ctc=73.079, loss_att=58.605, acc=0.691, loss=62.947, backward_time=1.240, grad_norm=88.620, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.179, optim0_lr0=1.121e-04, train_time=3.218 +[gpub015:0/64] 2023-07-04 18:06:11,084 (trainer:732) INFO: 11epoch:train:9501-9600batch: iter_time=1.217e-04, forward_time=0.144, loss_ctc=69.793, loss_att=52.272, acc=0.687, loss=57.529, backward_time=1.237, grad_norm=86.909, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.121e-04, train_time=3.137 +[gpub015:0/64] 2023-07-04 18:08:48,008 (trainer:732) INFO: 11epoch:train:9601-9700batch: iter_time=1.100e-04, forward_time=0.143, loss_ctc=77.129, loss_att=60.450, acc=0.676, loss=65.454, backward_time=1.238, grad_norm=89.294, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.120e-04, train_time=3.138 +[gpub015:0/64] 2023-07-04 18:11:24,516 (trainer:732) INFO: 11epoch:train:9701-9800batch: iter_time=1.087e-04, forward_time=0.142, loss_ctc=66.975, loss_att=52.929, acc=0.666, loss=57.143, backward_time=1.236, grad_norm=82.429, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.119e-04, train_time=3.130 +[gpub015:0/64] 2023-07-04 18:14:01,398 
(trainer:732) INFO: 11epoch:train:9801-9900batch: iter_time=1.081e-04, forward_time=0.143, loss_ctc=74.627, loss_att=61.438, acc=0.686, loss=65.395, backward_time=1.237, grad_norm=80.146, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.119e-04, train_time=3.137 +[gpub015:0/64] 2023-07-04 18:16:38,364 (trainer:732) INFO: 11epoch:train:9901-10000batch: iter_time=1.095e-04, forward_time=0.144, loss_ctc=80.252, loss_att=63.991, acc=0.691, loss=68.869, backward_time=1.238, grad_norm=89.399, clip=100.000, loss_scale=1.100e+12, optim_step_time=0.178, optim0_lr0=1.118e-04, train_time=3.139 +[gpub015:0/64] 2023-07-04 18:29:09,297 (trainer:338) INFO: 11epoch results: [train] iter_time=0.153, forward_time=0.145, loss_ctc=74.807, loss_att=58.923, acc=0.676, loss=63.688, backward_time=1.240, grad_norm=88.098, clip=100.000, loss_scale=7.147e+11, optim_step_time=0.178, optim0_lr0=1.147e-04, train_time=3.683, time=5 hours, 7 minutes and 6.9 seconds, total_count=80000, gpu_max_cached_mem_GB=33.838, [valid] loss_ctc=61.632, cer_ctc=0.324, loss_att=49.266, acc=0.614, cer=0.445, wer=1.000, loss=52.976, time=6 minutes and 24.4 seconds, total_count=8602, gpu_max_cached_mem_GB=37.133, [att_plot] time=5 minutes and 54.75 seconds, total_count=0, gpu_max_cached_mem_GB=37.133 +[gpub015:0/64] 2023-07-04 18:29:26,715 (trainer:386) INFO: The best model has been updated: valid.acc, valid.total_count +[gpub015:0/64] 2023-07-04 18:29:26,720 (trainer:440) INFO: The model files were removed: exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/6epoch.pth +[gpub015:0/64] 2023-07-04 18:29:26,777 (trainer:272) INFO: 12/100epoch started. Estimated time to finish: 2 weeks, 5 days and 18 hours +[gpub015:0/64] 2023-07-04 18:29:28,074 (multiple_iter_factory:32) INFO: Building 0th iter-factory... 
+[gpub015:0/64] 2023-07-04 18:29:45,861 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 18:29:49,136 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.0", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.0", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.0", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.0", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 18:29:49,136 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.0, +[gpub015:0/64] 2023-07-04 18:29:49,212 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 18:35:37,004 (trainer:732) INFO: 12epoch:train:1-100batch: iter_time=2.064, forward_time=0.171, loss_ctc=84.536, loss_att=72.238, acc=0.649, loss=75.927, backward_time=1.250, grad_norm=121.129, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.118e-04, train_time=7.389 +[gpub015:0/64] 2023-07-04 18:38:17,593 (trainer:732) INFO: 12epoch:train:101-200batch: iter_time=1.095e-04, forward_time=0.144, loss_ctc=73.120, loss_att=54.091, acc=0.688, loss=59.800, backward_time=1.248, grad_norm=107.383, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.117e-04, train_time=3.212 +[gpub015:0/64] 2023-07-04 18:41:05,153 (trainer:732) INFO: 12epoch:train:201-300batch: iter_time=1.067e-04, forward_time=0.145, loss_ctc=70.845, loss_att=60.210, acc=0.659, loss=63.401, backward_time=1.249, grad_norm=88.294, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.117e-04, train_time=3.351 +[gpub015:0/64] 2023-07-04 18:43:44,855 (trainer:732) INFO: 12epoch:train:301-400batch: iter_time=1.144e-04, forward_time=0.144, loss_ctc=85.017, loss_att=69.739, acc=0.635, loss=74.322, backward_time=1.244, grad_norm=98.262, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.116e-04, train_time=3.194 +[gpub015:0/64] 2023-07-04 18:46:25,802 (trainer:732) INFO: 12epoch:train:401-500batch: iter_time=1.111e-04, forward_time=0.144, loss_ctc=74.240, loss_att=58.589, acc=0.662, loss=63.284, backward_time=1.244, grad_norm=86.501, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.116e-04, train_time=3.219 +[gpub015:0/64] 2023-07-04 18:49:05,301 (trainer:732) INFO: 12epoch:train:501-600batch: iter_time=1.124e-04, forward_time=0.145, loss_ctc=76.668, loss_att=63.744, acc=0.656, loss=67.622, backward_time=1.245, grad_norm=100.634, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.115e-04, train_time=3.190 +[gpub015:0/64] 2023-07-04 18:51:45,925 (trainer:732) INFO: 12epoch:train:601-700batch: iter_time=1.142e-04, forward_time=0.144, loss_ctc=81.493, loss_att=63.434, acc=0.662, loss=68.852, backward_time=1.243, grad_norm=121.231, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.114e-04, train_time=3.212 +[gpub015:0/64] 2023-07-04 18:54:26,636 (trainer:732) INFO: 12epoch:train:701-800batch: iter_time=1.127e-04, forward_time=0.145, loss_ctc=80.088, loss_att=59.983, acc=0.669, loss=66.014, backward_time=1.245, grad_norm=96.921, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.114e-04, train_time=3.214 
+[gpub015:0/64] 2023-07-04 18:55:23,266 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpub015:0/64] 2023-07-04 18:55:40,570 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 18:55:43,896 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.11", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.11", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.11", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.11", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 18:55:43,896 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.11, +[gpub015:0/64] 2023-07-04 18:55:43,902 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 18:59:47,620 (trainer:732) INFO: 12epoch:train:801-900batch: iter_time=1.289, forward_time=0.166, loss_ctc=86.833, loss_att=71.102, acc=0.669, loss=75.821, backward_time=1.252, grad_norm=107.388, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.113e-04, train_time=6.419 +[gpub015:0/64] 2023-07-04 19:02:25,633 (trainer:732) INFO: 12epoch:train:901-1000batch: iter_time=1.181e-04, forward_time=0.146, loss_ctc=74.625, loss_att=53.239, acc=0.692, loss=59.655, backward_time=1.243, grad_norm=84.263, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.113e-04, train_time=3.160 +[gpub015:0/64] 2023-07-04 19:05:02,569 (trainer:732) INFO: 12epoch:train:1001-1100batch: iter_time=1.213e-04, forward_time=0.145, loss_ctc=62.727, loss_att=49.968, acc=0.684, loss=53.796, backward_time=1.239, grad_norm=147.092, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.112e-04, train_time=3.138 +[gpub015:0/64] 2023-07-04 19:07:40,033 (trainer:732) INFO: 12epoch:train:1101-1200batch: iter_time=1.236e-04, forward_time=0.145, loss_ctc=85.676, loss_att=74.606, acc=0.647, loss=77.927, backward_time=1.242, grad_norm=97.002, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.112e-04, train_time=3.149 +[gpub015:0/64] 2023-07-04 19:10:17,138 (trainer:732) INFO: 12epoch:train:1201-1300batch: iter_time=1.088e-04, forward_time=0.144, loss_ctc=75.522, loss_att=58.402, acc=0.663, loss=63.538, backward_time=1.241, grad_norm=92.267, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.111e-04, train_time=3.142 +[gpub015:0/64] 2023-07-04 19:12:54,130 (trainer:732) INFO: 12epoch:train:1301-1400batch: iter_time=1.219e-04, forward_time=0.144, loss_ctc=72.477, loss_att=61.256, acc=0.664, loss=64.622, backward_time=1.241, grad_norm=85.313, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.111e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 19:15:31,210 (trainer:732) INFO: 12epoch:train:1401-1500batch: iter_time=1.172e-04, forward_time=0.144, loss_ctc=81.927, loss_att=61.508, acc=0.682, loss=67.634, backward_time=1.242, grad_norm=94.138, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.110e-04, train_time=3.141 +[gpub015:0/64] 2023-07-04 19:18:08,327 (trainer:732) INFO: 12epoch:train:1501-1600batch: iter_time=1.168e-04, forward_time=0.144, loss_ctc=79.107, loss_att=62.684, acc=0.673, loss=67.611, 
backward_time=1.242, grad_norm=109.704, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.109e-04, train_time=3.142 +[gpub015:0/64] 2023-07-04 19:20:01,909 (multiple_iter_factory:32) INFO: Building 2th iter-factory... +[gpub015:0/64] 2023-07-04 19:20:19,627 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 19:20:23,028 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.9", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 19:20:23,028 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.9, +[gpub015:0/64] 2023-07-04 19:20:23,034 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 19:23:59,707 (trainer:732) INFO: 12epoch:train:1601-1700batch: iter_time=1.736, forward_time=0.146, loss_ctc=86.653, loss_att=68.901, acc=0.661, loss=74.226, backward_time=1.254, grad_norm=89.859, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.109e-04, train_time=7.027 +[gpub015:0/64] 2023-07-04 19:26:37,403 (trainer:732) INFO: 12epoch:train:1701-1800batch: iter_time=1.135e-04, forward_time=0.147, loss_ctc=77.426, loss_att=60.332, acc=0.688, loss=65.461, backward_time=1.245, grad_norm=85.067, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.108e-04, train_time=3.154 +[gpub015:0/64] 2023-07-04 19:29:14,527 (trainer:732) INFO: 12epoch:train:1801-1900batch: iter_time=1.164e-04, forward_time=0.146, loss_ctc=64.854, loss_att=49.130, acc=0.702, loss=53.847, backward_time=1.242, grad_norm=88.422, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.108e-04, train_time=3.142 +[gpub015:0/64] 2023-07-04 19:31:51,842 (trainer:732) INFO: 12epoch:train:1901-2000batch: iter_time=1.213e-04, forward_time=0.147, loss_ctc=74.672, loss_att=64.816, acc=0.661, loss=67.773, backward_time=1.243, grad_norm=86.333, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.107e-04, train_time=3.146 +[gpub015:0/64] 2023-07-04 19:34:29,137 (trainer:732) INFO: 12epoch:train:2001-2100batch: iter_time=1.190e-04, forward_time=0.147, loss_ctc=84.626, loss_att=67.119, acc=0.658, loss=72.371, backward_time=1.243, grad_norm=95.382, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.107e-04, train_time=3.146 +[gpub015:0/64] 2023-07-04 19:37:06,442 (trainer:732) INFO: 12epoch:train:2101-2200batch: iter_time=1.147e-04, forward_time=0.147, loss_ctc=70.258, loss_att=56.040, acc=0.670, loss=60.306, backward_time=1.243, grad_norm=107.485, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.106e-04, train_time=3.146 +[gpub015:0/64] 2023-07-04 19:39:43,817 (trainer:732) INFO: 12epoch:train:2201-2300batch: iter_time=1.094e-04, forward_time=0.147, loss_ctc=74.006, loss_att=60.037, acc=0.680, loss=64.228, backward_time=1.243, grad_norm=107.315, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.106e-04, train_time=3.147 +[gpub015:0/64] 2023-07-04 19:42:21,229 (trainer:732) INFO: 
12epoch:train:2301-2400batch: iter_time=1.081e-04, forward_time=0.147, loss_ctc=86.224, loss_att=67.922, acc=0.683, loss=73.412, backward_time=1.243, grad_norm=104.049, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.105e-04, train_time=3.148 +[gpub015:0/64] 2023-07-04 19:44:58,641 (trainer:732) INFO: 12epoch:train:2401-2500batch: iter_time=1.103e-04, forward_time=0.148, loss_ctc=75.849, loss_att=55.993, acc=0.677, loss=61.950, backward_time=1.242, grad_norm=96.455, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.105e-04, train_time=3.148 +[gpub015:0/64] 2023-07-04 19:45:01,701 (multiple_iter_factory:32) INFO: Building 3th iter-factory... +[gpub015:0/64] 2023-07-04 19:45:19,346 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 19:45:22,688 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.7", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.7", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.7", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.7", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 19:45:22,688 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.7, +[gpub015:0/64] 2023-07-04 19:45:22,694 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 19:51:26,255 (trainer:732) INFO: 12epoch:train:2501-2600batch: iter_time=1.225, forward_time=0.175, loss_ctc=81.079, loss_att=68.853, acc=0.671, loss=72.521, backward_time=1.255, grad_norm=89.970, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.104e-04, train_time=7.751 +[gpub015:0/64] 2023-07-04 19:54:04,278 (trainer:732) INFO: 12epoch:train:2601-2700batch: iter_time=1.185e-04, forward_time=0.144, loss_ctc=71.385, loss_att=52.315, acc=0.699, loss=58.036, backward_time=1.244, grad_norm=77.476, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.103e-04, train_time=3.161 +[gpub015:0/64] 2023-07-04 19:56:41,442 (trainer:732) INFO: 12epoch:train:2701-2800batch: iter_time=1.104e-04, forward_time=0.144, loss_ctc=67.155, loss_att=55.880, acc=0.678, loss=59.262, backward_time=1.242, grad_norm=77.514, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.103e-04, train_time=3.143 +[gpub015:0/64] 2023-07-04 19:59:18,844 (trainer:732) INFO: 12epoch:train:2801-2900batch: iter_time=1.069e-04, forward_time=0.145, loss_ctc=83.629, loss_att=65.947, acc=0.656, loss=71.251, backward_time=1.242, grad_norm=88.064, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.102e-04, train_time=3.148 +[gpub015:0/64] 2023-07-04 20:01:55,894 (trainer:732) INFO: 12epoch:train:2901-3000batch: iter_time=1.108e-04, forward_time=0.144, loss_ctc=74.662, loss_att=58.929, acc=0.668, loss=63.649, backward_time=1.242, grad_norm=86.125, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.102e-04, train_time=3.141 +[gpub015:0/64] 2023-07-04 20:04:33,377 (trainer:732) INFO: 12epoch:train:3001-3100batch: iter_time=1.100e-04, forward_time=0.145, loss_ctc=74.376, loss_att=60.452, acc=0.678, loss=64.629, backward_time=1.244, grad_norm=89.490, clip=100.000, loss_scale=2.199e+12, 
optim_step_time=0.181, optim0_lr0=1.101e-04, train_time=3.149 +[gpub015:0/64] 2023-07-04 20:07:10,509 (trainer:732) INFO: 12epoch:train:3101-3200batch: iter_time=1.079e-04, forward_time=0.144, loss_ctc=77.433, loss_att=60.732, acc=0.681, loss=65.743, backward_time=1.242, grad_norm=105.819, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.101e-04, train_time=3.142 +[gpub015:0/64] 2023-07-04 20:09:47,663 (trainer:732) INFO: 12epoch:train:3201-3300batch: iter_time=1.169e-04, forward_time=0.145, loss_ctc=77.566, loss_att=60.190, acc=0.678, loss=65.403, backward_time=1.242, grad_norm=127.184, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.100e-04, train_time=3.143 +[gpub015:0/64] 2023-07-04 20:10:48,102 (multiple_iter_factory:32) INFO: Building 4th iter-factory... +[gpub015:0/64] 2023-07-04 20:11:06,164 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 20:11:09,520 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.6", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.6", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.6", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.6", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 20:11:09,520 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.6, +[gpub015:0/64] 2023-07-04 20:11:09,526 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 20:16:56,469 (trainer:732) INFO: 12epoch:train:3301-3400batch: iter_time=1.721, forward_time=0.145, loss_ctc=81.802, loss_att=68.440, acc=0.662, loss=72.449, backward_time=1.251, grad_norm=120.608, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.100e-04, train_time=8.576 +[gpub015:0/64] 2023-07-04 20:19:33,743 (trainer:732) INFO: 12epoch:train:3401-3500batch: iter_time=1.214e-04, forward_time=0.144, loss_ctc=76.097, loss_att=57.732, acc=0.683, loss=63.241, backward_time=1.240, grad_norm=86.251, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.099e-04, train_time=3.145 +[gpub015:0/64] 2023-07-04 20:22:10,967 (trainer:732) INFO: 12epoch:train:3501-3600batch: iter_time=1.174e-04, forward_time=0.144, loss_ctc=62.531, loss_att=47.624, acc=0.692, loss=52.096, backward_time=1.240, grad_norm=76.543, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.099e-04, train_time=3.144 +[gpub015:0/64] 2023-07-04 20:24:48,258 (trainer:732) INFO: 12epoch:train:3601-3700batch: iter_time=1.147e-04, forward_time=0.145, loss_ctc=82.753, loss_att=73.085, acc=0.642, loss=75.986, backward_time=1.242, grad_norm=93.445, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.098e-04, train_time=3.146 +[gpub015:0/64] 2023-07-04 20:27:25,282 (trainer:732) INFO: 12epoch:train:3701-3800batch: iter_time=1.122e-04, forward_time=0.144, loss_ctc=79.961, loss_att=63.845, acc=0.651, loss=68.680, backward_time=1.241, grad_norm=94.879, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.181, optim0_lr0=1.098e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 20:30:02,221 (trainer:732) INFO: 12epoch:train:3801-3900batch: iter_time=1.218e-04, forward_time=0.145, 
loss_ctc=70.155, loss_att=59.692, acc=0.663, loss=62.831, backward_time=1.241, grad_norm=81.352, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.097e-04, train_time=3.139 +[gpub015:0/64] 2023-07-04 20:32:39,167 (trainer:732) INFO: 12epoch:train:3901-4000batch: iter_time=1.407e-04, forward_time=0.144, loss_ctc=80.943, loss_att=62.078, acc=0.682, loss=67.738, backward_time=1.242, grad_norm=92.320, clip=100.000, loss_scale=2.199e+12, optim_step_time=0.182, optim0_lr0=1.097e-04, train_time=3.139 +[gpub015:0/64] 2023-07-04 20:35:16,227 (trainer:732) INFO: 12epoch:train:4001-4100batch: iter_time=1.208e-04, forward_time=0.145, loss_ctc=76.768, loss_att=59.276, acc=0.678, loss=64.524, backward_time=1.241, grad_norm=102.125, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.096e-04, train_time=3.141 +[gpub015:0/64] 2023-07-04 20:37:02,841 (multiple_iter_factory:32) INFO: Building 5th iter-factory... +[gpub015:0/64] 2023-07-04 20:37:20,845 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 20:37:24,221 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.8", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.8", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.8", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.8", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 20:37:24,221 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.8, +[gpub015:0/64] 2023-07-04 20:37:24,227 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 20:41:28,761 (trainer:732) INFO: 12epoch:train:4101-4200batch: iter_time=1.213, forward_time=0.145, loss_ctc=82.650, loss_att=63.715, acc=0.660, loss=69.395, backward_time=1.255, grad_norm=108.665, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.096e-04, train_time=7.450 +[gpub015:0/64] 2023-07-04 20:44:06,299 (trainer:732) INFO: 12epoch:train:4201-4300batch: iter_time=1.138e-04, forward_time=0.145, loss_ctc=77.528, loss_att=61.049, acc=0.680, loss=65.993, backward_time=1.243, grad_norm=102.226, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.095e-04, train_time=3.151 +[gpub015:0/64] 2023-07-04 20:46:43,366 (trainer:732) INFO: 12epoch:train:4301-4400batch: iter_time=1.237e-04, forward_time=0.144, loss_ctc=63.974, loss_att=49.928, acc=0.703, loss=54.141, backward_time=1.241, grad_norm=85.946, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.094e-04, train_time=3.141 +[gpub015:0/64] 2023-07-04 20:49:20,761 (trainer:732) INFO: 12epoch:train:4401-4500batch: iter_time=1.296e-04, forward_time=0.145, loss_ctc=73.590, loss_att=63.027, acc=0.653, loss=66.196, backward_time=1.243, grad_norm=98.824, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.180, optim0_lr0=1.094e-04, train_time=3.148 +[gpub015:0/64] 2023-07-04 20:51:57,869 (trainer:732) INFO: 12epoch:train:4501-4600batch: iter_time=1.230e-04, forward_time=0.143, loss_ctc=86.089, loss_att=68.894, acc=0.647, loss=74.053, backward_time=1.241, grad_norm=98.658, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.180, optim0_lr0=1.093e-04, train_time=3.142 
+[gpub015:0/64] 2023-07-04 20:54:34,865 (trainer:732) INFO: 12epoch:train:4601-4700batch: iter_time=1.307e-04, forward_time=0.144, loss_ctc=69.746, loss_att=61.107, acc=0.658, loss=63.699, backward_time=1.241, grad_norm=101.548, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.180, optim0_lr0=1.093e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 20:57:11,892 (trainer:732) INFO: 12epoch:train:4701-4800batch: iter_time=1.379e-04, forward_time=0.144, loss_ctc=72.222, loss_att=59.132, acc=0.676, loss=63.059, backward_time=1.240, grad_norm=95.294, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.092e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 20:59:49,193 (trainer:732) INFO: 12epoch:train:4801-4900batch: iter_time=1.299e-04, forward_time=0.144, loss_ctc=84.221, loss_att=67.911, acc=0.672, loss=72.804, backward_time=1.242, grad_norm=97.856, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.180, optim0_lr0=1.092e-04, train_time=3.146 +[gpub015:0/64] 2023-07-04 21:02:26,288 (trainer:732) INFO: 12epoch:train:4901-5000batch: iter_time=1.076e-04, forward_time=0.145, loss_ctc=77.713, loss_att=56.351, acc=0.676, loss=62.760, backward_time=1.241, grad_norm=93.021, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.180, optim0_lr0=1.091e-04, train_time=3.142 +[gpub015:0/64] 2023-07-04 21:02:29,142 (multiple_iter_factory:32) INFO: Building 6th iter-factory... +[gpub015:0/64] 2023-07-04 21:02:47,188 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 21:02:50,583 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.10", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.10", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.10", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.10", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 21:02:50,583 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.10, +[gpub015:0/64] 2023-07-04 21:02:50,589 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 21:09:23,316 (trainer:732) INFO: 12epoch:train:5001-5100batch: iter_time=1.207, forward_time=0.146, loss_ctc=82.085, loss_att=70.491, acc=0.667, loss=73.969, backward_time=1.255, grad_norm=101.219, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.182, optim0_lr0=1.091e-04, train_time=8.340 +[gpub015:0/64] 2023-07-04 21:12:00,519 (trainer:732) INFO: 12epoch:train:5101-5200batch: iter_time=1.063e-04, forward_time=0.145, loss_ctc=70.631, loss_att=52.245, acc=0.702, loss=57.761, backward_time=1.241, grad_norm=82.535, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.182, optim0_lr0=1.090e-04, train_time=3.144 +[gpub015:0/64] 2023-07-04 21:14:37,398 (trainer:732) INFO: 12epoch:train:5201-5300batch: iter_time=1.102e-04, forward_time=0.144, loss_ctc=69.574, loss_att=59.873, acc=0.662, loss=62.783, backward_time=1.240, grad_norm=83.957, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.090e-04, train_time=3.137 +[gpub015:0/64] 2023-07-04 21:17:14,881 (trainer:732) INFO: 12epoch:train:5301-5400batch: iter_time=9.520e-05, forward_time=0.145, loss_ctc=80.138, loss_att=66.006, acc=0.645, loss=70.246, 
backward_time=1.242, grad_norm=92.850, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.089e-04, train_time=3.149 +[gpub015:0/64] 2023-07-04 21:19:51,908 (trainer:732) INFO: 12epoch:train:5401-5500batch: iter_time=9.896e-05, forward_time=0.144, loss_ctc=71.912, loss_att=58.300, acc=0.672, loss=62.384, backward_time=1.240, grad_norm=92.965, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.089e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 21:22:28,899 (trainer:732) INFO: 12epoch:train:5501-5600batch: iter_time=9.511e-05, forward_time=0.144, loss_ctc=74.006, loss_att=59.170, acc=0.672, loss=63.621, backward_time=1.241, grad_norm=83.684, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.088e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 21:25:05,777 (trainer:732) INFO: 12epoch:train:5601-5700batch: iter_time=9.316e-05, forward_time=0.143, loss_ctc=77.242, loss_att=61.429, acc=0.675, loss=66.173, backward_time=1.240, grad_norm=96.974, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.088e-04, train_time=3.137 +[gpub015:0/64] 2023-07-04 21:27:42,708 (trainer:732) INFO: 12epoch:train:5701-5800batch: iter_time=1.053e-04, forward_time=0.144, loss_ctc=77.205, loss_att=58.937, acc=0.675, loss=64.417, backward_time=1.240, grad_norm=105.464, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.087e-04, train_time=3.138 +[gpub015:0/64] 2023-07-04 21:28:38,964 (multiple_iter_factory:32) INFO: Building 7th iter-factory... +[gpub015:0/64] 2023-07-04 21:28:56,893 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 21:29:00,277 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.5", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 21:29:00,278 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.5, +[gpub015:0/64] 2023-07-04 21:29:00,284 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 21:33:26,178 (trainer:732) INFO: 12epoch:train:5801-5900batch: iter_time=1.191, forward_time=0.146, loss_ctc=82.820, loss_att=69.559, acc=0.671, loss=73.537, backward_time=1.262, grad_norm=95.044, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.087e-04, train_time=6.869 +[gpub015:0/64] 2023-07-04 21:36:03,771 (trainer:732) INFO: 12epoch:train:5901-6000batch: iter_time=1.103e-04, forward_time=0.146, loss_ctc=75.434, loss_att=55.881, acc=0.695, loss=61.747, backward_time=1.242, grad_norm=85.721, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.182, optim0_lr0=1.086e-04, train_time=3.152 +[gpub015:0/64] 2023-07-04 21:38:40,878 (trainer:732) INFO: 12epoch:train:6001-6100batch: iter_time=1.150e-04, forward_time=0.146, loss_ctc=61.211, loss_att=46.498, acc=0.698, loss=50.912, backward_time=1.241, grad_norm=79.707, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.086e-04, train_time=3.142 +[gpub015:0/64] 2023-07-04 21:41:18,354 (trainer:732) INFO: 
12epoch:train:6101-6200batch: iter_time=1.148e-04, forward_time=0.146, loss_ctc=81.359, loss_att=70.422, acc=0.664, loss=73.703, backward_time=1.243, grad_norm=100.635, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.085e-04, train_time=3.149 +[gpub015:0/64] 2023-07-04 21:43:55,529 (trainer:732) INFO: 12epoch:train:6201-6300batch: iter_time=1.147e-04, forward_time=0.146, loss_ctc=77.317, loss_att=64.201, acc=0.663, loss=68.136, backward_time=1.242, grad_norm=85.881, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.085e-04, train_time=3.143 +[gpub015:0/64] 2023-07-04 21:46:32,894 (trainer:732) INFO: 12epoch:train:6301-6400batch: iter_time=1.152e-04, forward_time=0.146, loss_ctc=69.240, loss_att=57.327, acc=0.672, loss=60.901, backward_time=1.241, grad_norm=90.918, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.084e-04, train_time=3.147 +[gpub015:0/64] 2023-07-04 21:49:10,067 (trainer:732) INFO: 12epoch:train:6401-6500batch: iter_time=1.151e-04, forward_time=0.146, loss_ctc=80.419, loss_att=61.954, acc=0.689, loss=67.493, backward_time=1.242, grad_norm=100.821, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.084e-04, train_time=3.143 +[gpub015:0/64] 2023-07-04 21:51:47,235 (trainer:732) INFO: 12epoch:train:6501-6600batch: iter_time=1.108e-04, forward_time=0.146, loss_ctc=76.050, loss_att=60.487, acc=0.687, loss=65.156, backward_time=1.242, grad_norm=88.698, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.083e-04, train_time=3.143 +[gpub015:0/64] 2023-07-04 21:53:35,633 (multiple_iter_factory:32) INFO: Building 8th iter-factory... +[gpub015:0/64] 2023-07-04 21:53:54,004 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 21:53:57,417 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.4", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.4", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.4", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.4", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 21:53:57,417 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.4, +[gpub015:0/64] 2023-07-04 21:53:57,423 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 21:59:20,328 (trainer:732) INFO: 12epoch:train:6601-6700batch: iter_time=1.193, forward_time=0.147, loss_ctc=82.390, loss_att=62.650, acc=0.665, loss=68.572, backward_time=1.253, grad_norm=98.318, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.083e-04, train_time=9.062 +[gpub015:0/64] 2023-07-04 22:01:58,471 (trainer:732) INFO: 12epoch:train:6701-6800batch: iter_time=1.162e-04, forward_time=0.144, loss_ctc=76.609, loss_att=61.085, acc=0.679, loss=65.742, backward_time=1.244, grad_norm=106.235, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.082e-04, train_time=3.163 +[gpub015:0/64] 2023-07-04 22:04:35,355 (trainer:732) INFO: 12epoch:train:6801-6900batch: iter_time=1.242e-04, forward_time=0.143, loss_ctc=64.435, loss_att=50.254, acc=0.706, loss=54.508, backward_time=1.240, grad_norm=81.368, clip=100.000, loss_scale=4.398e+12, 
optim_step_time=0.182, optim0_lr0=1.082e-04, train_time=3.137 +[gpub015:0/64] 2023-07-04 22:07:12,751 (trainer:732) INFO: 12epoch:train:6901-7000batch: iter_time=1.163e-04, forward_time=0.144, loss_ctc=70.848, loss_att=61.231, acc=0.665, loss=64.116, backward_time=1.241, grad_norm=86.145, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.081e-04, train_time=3.148 +[gpub015:0/64] 2023-07-04 22:09:49,941 (trainer:732) INFO: 12epoch:train:7001-7100batch: iter_time=1.173e-04, forward_time=0.145, loss_ctc=83.009, loss_att=68.382, acc=0.652, loss=72.770, backward_time=1.243, grad_norm=95.705, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.182, optim0_lr0=1.081e-04, train_time=3.144 +[gpub015:0/64] 2023-07-04 22:12:26,941 (trainer:732) INFO: 12epoch:train:7101-7200batch: iter_time=9.946e-05, forward_time=0.144, loss_ctc=70.979, loss_att=56.996, acc=0.668, loss=61.190, backward_time=1.242, grad_norm=78.799, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.080e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 22:15:04,098 (trainer:732) INFO: 12epoch:train:7201-7300batch: iter_time=1.017e-04, forward_time=0.144, loss_ctc=70.377, loss_att=56.229, acc=0.680, loss=60.473, backward_time=1.242, grad_norm=84.014, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.080e-04, train_time=3.143 +[gpub015:0/64] 2023-07-04 22:17:41,598 (trainer:732) INFO: 12epoch:train:7301-7400batch: iter_time=9.492e-05, forward_time=0.144, loss_ctc=81.351, loss_att=66.341, acc=0.673, loss=70.844, backward_time=1.243, grad_norm=93.304, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.079e-04, train_time=3.150 +[gpub015:0/64] 2023-07-04 22:20:18,910 (trainer:732) INFO: 12epoch:train:7401-7500batch: iter_time=1.063e-04, forward_time=0.144, loss_ctc=78.117, loss_att=55.626, acc=0.691, loss=62.374, backward_time=1.242, grad_norm=96.027, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.079e-04, train_time=3.146 +[gpub015:0/64] 2023-07-04 22:20:21,637 (multiple_iter_factory:32) INFO: Building 9th iter-factory... 
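A note on the loss_scale column: the values are exact powers of two (2.199e+12 = 2^41, 4.398e+12 = 2^42, and later 2^43, 2^44, 2^45), doubling after long runs of overflow-free steps. That is the signature of dynamic loss scaling in fp16 mixed-precision training, presumably enabled via the training config (the yaml itself is not shown in this log). A minimal sketch of the mechanism using PyTorch's GradScaler, with a hypothetical model/batch interface; this is an illustration of the technique, not the ESPnet trainer code:

import torch
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler(growth_factor=2.0, backoff_factor=0.5, growth_interval=2000)

def train_step(model, batch, optimizer, max_norm=100.0):
    # max_norm=100.0 mirrors the constant clip=100.000 reported above
    # (assumed to be the clipping threshold; grad_norm is the pre-clip norm)
    optimizer.zero_grad(set_to_none=True)
    with autocast():                          # fp16 forward
        loss = model(**batch)                 # hypothetical interface
    scaler.scale(loss).backward()             # backward on the scaled loss
    scaler.unscale_(optimizer)                # clip the true, unscaled grads
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    scaler.step(optimizer)                    # skipped if grads overflowed
    scaler.update()                           # x2 after growth_interval clean
                                              # steps, x0.5 on overflow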
+[gpub015:0/64] 2023-07-04 22:20:39,406 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 22:20:42,788 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.3", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.3", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.3", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.3", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 22:20:42,788 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.3, +[gpub015:0/64] 2023-07-04 22:20:42,794 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 22:26:34,393 (trainer:732) INFO: 12epoch:train:7501-7600batch: iter_time=1.201, forward_time=0.146, loss_ctc=78.276, loss_att=65.547, acc=0.682, loss=69.366, backward_time=1.258, grad_norm=125.636, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.078e-04, train_time=7.509 +[gpub015:0/64] 2023-07-04 22:29:11,707 (trainer:732) INFO: 12epoch:train:7601-7700batch: iter_time=1.155e-04, forward_time=0.145, loss_ctc=69.956, loss_att=52.275, acc=0.704, loss=57.579, backward_time=1.241, grad_norm=86.535, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.078e-04, train_time=3.146 +[gpub015:0/64] 2023-07-04 22:31:48,758 (trainer:732) INFO: 12epoch:train:7701-7800batch: iter_time=1.089e-04, forward_time=0.145, loss_ctc=66.350, loss_att=56.272, acc=0.683, loss=59.295, backward_time=1.241, grad_norm=81.376, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.182, optim0_lr0=1.077e-04, train_time=3.141 +[gpub015:0/64] 2023-07-04 22:34:26,515 (trainer:732) INFO: 12epoch:train:7801-7900batch: iter_time=1.116e-04, forward_time=0.146, loss_ctc=81.444, loss_att=63.780, acc=0.666, loss=69.079, backward_time=1.243, grad_norm=82.021, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.182, optim0_lr0=1.077e-04, train_time=3.155 +[gpub015:0/64] 2023-07-04 22:37:03,817 (trainer:732) INFO: 12epoch:train:7901-8000batch: iter_time=1.259e-04, forward_time=0.146, loss_ctc=73.305, loss_att=58.553, acc=0.673, loss=62.979, backward_time=1.242, grad_norm=86.271, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.076e-04, train_time=3.146 +[gpub015:0/64] 2023-07-04 22:39:41,184 (trainer:732) INFO: 12epoch:train:8001-8100batch: iter_time=1.047e-04, forward_time=0.146, loss_ctc=72.842, loss_att=59.680, acc=0.680, loss=63.629, backward_time=1.243, grad_norm=83.633, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.076e-04, train_time=3.147 +[gpub015:0/64] 2023-07-04 22:42:18,296 (trainer:732) INFO: 12epoch:train:8101-8200batch: iter_time=1.052e-04, forward_time=0.145, loss_ctc=76.362, loss_att=60.168, acc=0.687, loss=65.026, backward_time=1.242, grad_norm=110.533, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.075e-04, train_time=3.142 +[gpub015:0/64] 2023-07-04 22:44:55,396 (trainer:732) INFO: 12epoch:train:8201-8300batch: iter_time=9.924e-05, forward_time=0.145, loss_ctc=76.992, loss_att=57.507, acc=0.689, loss=63.352, backward_time=1.242, grad_norm=87.809, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.075e-04, 
train_time=3.142 +[gpub015:0/64] 2023-07-04 22:45:50,214 (multiple_iter_factory:32) INFO: Building 10th iter-factory... +[gpub015:0/64] 2023-07-04 22:46:08,597 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 22:46:12,246 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.2", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.2", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.2", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.2", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 22:46:12,247 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.2, +[gpub015:0/64] 2023-07-04 22:46:12,253 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 22:51:21,478 (trainer:732) INFO: 12epoch:train:8301-8400batch: iter_time=1.205, forward_time=0.144, loss_ctc=80.365, loss_att=65.058, acc=0.674, loss=69.650, backward_time=1.253, grad_norm=106.325, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.074e-04, train_time=7.721 +[gpub015:0/64] 2023-07-04 22:53:59,133 (trainer:732) INFO: 12epoch:train:8401-8500batch: iter_time=1.152e-04, forward_time=0.145, loss_ctc=74.424, loss_att=55.889, acc=0.689, loss=61.450, backward_time=1.242, grad_norm=95.701, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.074e-04, train_time=3.153 +[gpub015:0/64] 2023-07-04 22:56:36,584 (trainer:732) INFO: 12epoch:train:8501-8600batch: iter_time=1.030e-04, forward_time=0.145, loss_ctc=61.774, loss_att=47.039, acc=0.701, loss=51.460, backward_time=1.242, grad_norm=81.038, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.073e-04, train_time=3.149 +[gpub015:0/64] 2023-07-04 22:59:13,839 (trainer:732) INFO: 12epoch:train:8601-8700batch: iter_time=1.063e-04, forward_time=0.145, loss_ctc=78.654, loss_att=69.938, acc=0.653, loss=72.552, backward_time=1.243, grad_norm=89.759, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.073e-04, train_time=3.145 +[gpub015:0/64] 2023-07-04 23:01:51,155 (trainer:732) INFO: 12epoch:train:8701-8800batch: iter_time=1.165e-04, forward_time=0.144, loss_ctc=77.945, loss_att=62.783, acc=0.660, loss=67.331, backward_time=1.241, grad_norm=83.799, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.072e-04, train_time=3.146 +[gpub015:0/64] 2023-07-04 23:04:28,142 (trainer:732) INFO: 12epoch:train:8801-8900batch: iter_time=1.221e-04, forward_time=0.144, loss_ctc=70.601, loss_att=58.052, acc=0.671, loss=61.817, backward_time=1.240, grad_norm=91.330, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.072e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 23:07:05,403 (trainer:732) INFO: 12epoch:train:8901-9000batch: iter_time=1.206e-04, forward_time=0.145, loss_ctc=80.327, loss_att=60.532, acc=0.690, loss=66.471, backward_time=1.242, grad_norm=96.076, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.071e-04, train_time=3.145 +[gpub015:0/64] 2023-07-04 23:09:42,434 (trainer:732) INFO: 12epoch:train:9001-9100batch: iter_time=1.162e-04, forward_time=0.144, loss_ctc=75.994, loss_att=58.252, acc=0.683, loss=63.575, 
backward_time=1.241, grad_norm=91.538, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.071e-04, train_time=3.140 +[gpub015:0/64] 2023-07-04 23:11:28,947 (multiple_iter_factory:32) INFO: Building 11th iter-factory... +[gpub015:0/64] 2023-07-04 23:11:46,760 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 23:11:50,122 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.1", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 23:11:50,122 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.1, +[gpub015:0/64] 2023-07-04 23:11:50,128 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 23:16:39,903 (trainer:732) INFO: 12epoch:train:9101-9200batch: iter_time=1.216, forward_time=0.145, loss_ctc=81.962, loss_att=63.013, acc=0.674, loss=68.697, backward_time=1.255, grad_norm=96.644, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.070e-04, train_time=8.349 +[gpub015:0/64] 2023-07-04 23:19:18,238 (trainer:732) INFO: 12epoch:train:9201-9300batch: iter_time=1.179e-04, forward_time=0.145, loss_ctc=76.818, loss_att=60.914, acc=0.689, loss=65.685, backward_time=1.244, grad_norm=97.394, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.070e-04, train_time=3.166 +[gpub015:0/64] 2023-07-04 23:21:55,891 (trainer:732) INFO: 12epoch:train:9301-9400batch: iter_time=1.133e-04, forward_time=0.145, loss_ctc=64.491, loss_att=48.491, acc=0.714, loss=53.291, backward_time=1.242, grad_norm=103.178, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.069e-04, train_time=3.153 +[gpub015:0/64] 2023-07-04 23:24:33,316 (trainer:732) INFO: 12epoch:train:9401-9500batch: iter_time=1.123e-04, forward_time=0.146, loss_ctc=71.079, loss_att=60.571, acc=0.676, loss=63.723, backward_time=1.242, grad_norm=96.268, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.069e-04, train_time=3.148 +[gpub015:0/64] 2023-07-04 23:27:10,748 (trainer:732) INFO: 12epoch:train:9501-9600batch: iter_time=1.076e-04, forward_time=0.146, loss_ctc=83.740, loss_att=65.360, acc=0.670, loss=70.874, backward_time=1.243, grad_norm=101.596, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.068e-04, train_time=3.148 +[gpub015:0/64] 2023-07-04 23:29:48,320 (trainer:732) INFO: 12epoch:train:9601-9700batch: iter_time=1.198e-04, forward_time=0.146, loss_ctc=68.537, loss_att=55.883, acc=0.678, loss=59.679, backward_time=1.243, grad_norm=91.222, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.068e-04, train_time=3.151 +[gpub015:0/64] 2023-07-04 23:32:25,785 (trainer:732) INFO: 12epoch:train:9701-9800batch: iter_time=1.094e-04, forward_time=0.146, loss_ctc=71.251, loss_att=56.795, acc=0.683, loss=61.132, backward_time=1.242, grad_norm=95.223, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.067e-04, train_time=3.149 +[gpub015:0/64] 2023-07-04 23:35:03,217 (trainer:732) INFO: 
12epoch:train:9801-9900batch: iter_time=1.105e-04, forward_time=0.145, loss_ctc=82.689, loss_att=66.938, acc=0.684, loss=71.663, backward_time=1.243, grad_norm=92.914, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.067e-04, train_time=3.148 +[gpub015:0/64] 2023-07-04 23:37:40,257 (trainer:732) INFO: 12epoch:train:9901-10000batch: iter_time=1.088e-04, forward_time=0.145, loss_ctc=76.321, loss_att=55.700, acc=0.689, loss=61.887, backward_time=1.240, grad_norm=106.101, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.066e-04, train_time=3.141 +[gpub015:0/64] 2023-07-04 23:50:19,270 (trainer:338) INFO: 12epoch results: [train] iter_time=0.165, forward_time=0.146, loss_ctc=76.028, loss_att=60.678, acc=0.674, loss=65.283, backward_time=1.244, grad_norm=95.160, clip=100.000, loss_scale=4.398e+12, optim_step_time=0.181, optim0_lr0=1.091e-04, train_time=3.698, time=5 hours, 8 minutes and 28.51 seconds, total_count=90000, gpu_max_cached_mem_GB=37.139, [valid] loss_ctc=56.126, cer_ctc=0.306, loss_att=45.911, acc=0.623, cer=0.448, wer=0.996, loss=48.975, time=6 minutes and 37.11 seconds, total_count=9614, gpu_max_cached_mem_GB=37.139, [att_plot] time=5 minutes and 46.68 seconds, total_count=0, gpu_max_cached_mem_GB=37.139 +[gpub015:0/64] 2023-07-04 23:50:34,276 (trainer:386) INFO: The best model has been updated: valid.acc, valid.total_count +[gpub015:0/64] 2023-07-04 23:50:34,316 (trainer:440) INFO: The model files were removed: exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/7epoch.pth +[gpub015:0/64] 2023-07-04 23:50:34,317 (trainer:272) INFO: 13/100epoch started. Estimated time to finish: 2 weeks, 5 days and 13 hours +[gpub015:0/64] 2023-07-04 23:50:34,320 (multiple_iter_factory:32) INFO: Building 0th iter-factory... 
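The reported loss is consistent, entry by entry, with the usual hybrid CTC/attention interpolation loss = w*loss_ctc + (1-w)*loss_att with w = 0.3; the weight is inferred from the logged numbers rather than quoted from the config, which is not shown here. A quick check against the epoch-12 summaries above:

def hybrid_loss(loss_ctc, loss_att, ctc_weight=0.3):
    # interpolation that reproduces every (loss_ctc, loss_att, loss) triple above
    return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att

# [train] summary: loss_ctc=76.028, loss_att=60.678, loss=65.283
assert abs(hybrid_loss(76.028, 60.678) - 65.283) < 5e-3
# [valid] summary: loss_ctc=56.126, loss_att=45.911, loss=48.975
assert abs(hybrid_loss(56.126, 45.911) - 48.975) < 5e-3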
+[gpub015:0/64] 2023-07-04 23:50:51,721 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-04 23:50:55,289 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.4", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.4", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.4", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.4", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-04 23:50:55,289 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.4, +[gpub015:0/64] 2023-07-04 23:50:55,296 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-04 23:55:20,069 (trainer:732) INFO: 13epoch:train:1-100batch: iter_time=1.207, forward_time=0.145, loss_ctc=85.987, loss_att=70.558, acc=0.640, loss=75.187, backward_time=1.257, grad_norm=124.267, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.066e-04, train_time=5.715 +[gpub015:0/64] 2023-07-04 23:58:04,071 (trainer:732) INFO: 13epoch:train:101-200batch: iter_time=0.006, forward_time=0.184, loss_ctc=73.151, loss_att=51.368, acc=0.686, loss=57.903, backward_time=1.255, grad_norm=88.292, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.182, optim0_lr0=1.065e-04, train_time=3.280 +[gpub015:0/64] 2023-07-05 00:00:49,379 (trainer:732) INFO: 13epoch:train:201-300batch: iter_time=1.286e-04, forward_time=0.174, loss_ctc=83.595, loss_att=65.113, acc=0.656, loss=70.658, backward_time=1.252, grad_norm=113.517, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.183, optim0_lr0=1.065e-04, train_time=3.306 +[gpub015:0/64] 2023-07-05 00:03:30,808 (trainer:732) INFO: 13epoch:train:301-400batch: iter_time=1.342e-04, forward_time=0.152, loss_ctc=84.466, loss_att=68.319, acc=0.661, loss=73.163, backward_time=1.243, grad_norm=105.019, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.182, optim0_lr0=1.064e-04, train_time=3.228 +[gpub015:0/64] 2023-07-05 00:06:08,085 (trainer:732) INFO: 13epoch:train:401-500batch: iter_time=1.279e-04, forward_time=0.144, loss_ctc=88.619, loss_att=76.517, acc=0.651, loss=80.147, backward_time=1.243, grad_norm=109.872, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.180, optim0_lr0=1.064e-04, train_time=3.145 +[gpub015:0/64] 2023-07-05 00:08:48,273 (trainer:732) INFO: 13epoch:train:501-600batch: iter_time=1.310e-04, forward_time=0.144, loss_ctc=78.073, loss_att=59.874, acc=0.665, loss=65.334, backward_time=1.243, grad_norm=157.357, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.063e-04, train_time=3.204 +[gpub015:0/64] 2023-07-05 00:11:32,992 (trainer:732) INFO: 13epoch:train:601-700batch: iter_time=1.264e-04, forward_time=0.143, loss_ctc=77.084, loss_att=60.101, acc=0.662, loss=65.196, backward_time=1.245, grad_norm=98.957, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.180, optim0_lr0=1.063e-04, train_time=3.294 +[gpub015:0/64] 2023-07-05 00:14:10,177 (trainer:732) INFO: 13epoch:train:701-800batch: iter_time=1.302e-04, forward_time=0.144, loss_ctc=72.641, loss_att=55.254, acc=0.691, loss=60.470, backward_time=1.241, grad_norm=97.430, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.062e-04, train_time=3.143 
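The "Estimated time to finish" printed at the start of epoch 13 is, to within a few hours, just the measured per-epoch wall time extrapolated over the 88 epochs that remain (13 through 100); the epoch-12 timings above reproduce it:

# epoch-12 wall clock, taken from the results line above
train_sec = 5 * 3600 + 8 * 60 + 28.51  # "5 hours, 8 minutes and 28.51 seconds"
valid_sec = 6 * 60 + 37.11             # "6 minutes and 37.11 seconds"
plot_sec = 5 * 60 + 46.68              # "5 minutes and 46.68 seconds"
per_epoch = train_sec + valid_sec + plot_sec     # ~5.35 h per epoch
print(88 * per_epoch / 86400)  # ~19.6 days vs "2 weeks, 5 days and 13 hours" (~19.5)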
+[gpub015:0/64] 2023-07-05 00:15:19,090 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpub015:0/64] 2023-07-05 00:15:36,248 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 00:15:39,840 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.1", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 00:15:39,840 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.1, +[gpub015:0/64] 2023-07-05 00:15:39,846 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 00:21:58,716 (trainer:732) INFO: 13epoch:train:801-900batch: iter_time=1.834, forward_time=0.144, loss_ctc=75.899, loss_att=62.053, acc=0.682, loss=66.207, backward_time=1.249, grad_norm=96.235, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.062e-04, train_time=9.371 +[gpub015:0/64] 2023-07-05 00:24:36,279 (trainer:732) INFO: 13epoch:train:901-1000batch: iter_time=1.032e-04, forward_time=0.144, loss_ctc=78.048, loss_att=61.186, acc=0.667, loss=66.245, backward_time=1.242, grad_norm=122.905, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.061e-04, train_time=3.151 +[gpub015:0/64] 2023-07-05 00:27:13,670 (trainer:732) INFO: 13epoch:train:1001-1100batch: iter_time=1.110e-04, forward_time=0.144, loss_ctc=76.879, loss_att=56.861, acc=0.686, loss=62.866, backward_time=1.242, grad_norm=98.500, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.061e-04, train_time=3.148 +[gpub015:0/64] 2023-07-05 00:29:50,859 (trainer:732) INFO: 13epoch:train:1101-1200batch: iter_time=1.162e-04, forward_time=0.145, loss_ctc=85.818, loss_att=67.227, acc=0.673, loss=72.804, backward_time=1.243, grad_norm=118.579, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.060e-04, train_time=3.144 +[gpub015:0/64] 2023-07-05 00:32:28,234 (trainer:732) INFO: 13epoch:train:1201-1300batch: iter_time=1.191e-04, forward_time=0.146, loss_ctc=85.561, loss_att=71.717, acc=0.666, loss=75.871, backward_time=1.245, grad_norm=103.403, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.060e-04, train_time=3.147 +[gpub015:0/64] 2023-07-05 00:35:05,850 (trainer:732) INFO: 13epoch:train:1301-1400batch: iter_time=1.150e-04, forward_time=0.146, loss_ctc=85.789, loss_att=70.197, acc=0.673, loss=74.875, backward_time=1.245, grad_norm=113.813, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.060e-04, train_time=3.152 +[gpub015:0/64] 2023-07-05 00:37:42,827 (trainer:732) INFO: 13epoch:train:1401-1500batch: iter_time=1.053e-04, forward_time=0.144, loss_ctc=76.540, loss_att=55.910, acc=0.667, loss=62.099, backward_time=1.240, grad_norm=91.779, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.059e-04, train_time=3.139 +[gpub015:0/64] 2023-07-05 00:40:20,163 (trainer:732) INFO: 13epoch:train:1501-1600batch: iter_time=1.059e-04, forward_time=0.145, loss_ctc=74.583, loss_att=60.149, acc=0.685, loss=64.479, 
backward_time=1.243, grad_norm=88.474, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.059e-04, train_time=3.147 +[gpub015:0/64] 2023-07-05 00:42:12,133 (multiple_iter_factory:32) INFO: Building 2th iter-factory... +[gpub015:0/64] 2023-07-05 00:42:30,034 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 00:42:33,425 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.5", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 00:42:33,425 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.5, +[gpub015:0/64] 2023-07-05 00:42:33,463 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 00:46:46,084 (trainer:732) INFO: 13epoch:train:1601-1700batch: iter_time=1.791, forward_time=0.145, loss_ctc=74.553, loss_att=58.445, acc=0.700, loss=63.277, backward_time=1.251, grad_norm=111.557, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.058e-04, train_time=7.718 +[gpub015:0/64] 2023-07-05 00:49:24,208 (trainer:732) INFO: 13epoch:train:1701-1800batch: iter_time=1.166e-04, forward_time=0.145, loss_ctc=75.927, loss_att=60.423, acc=0.666, loss=65.074, backward_time=1.244, grad_norm=134.424, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.058e-04, train_time=3.162 +[gpub015:0/64] 2023-07-05 00:52:01,356 (trainer:732) INFO: 13epoch:train:1801-1900batch: iter_time=1.104e-04, forward_time=0.144, loss_ctc=77.363, loss_att=57.785, acc=0.684, loss=63.658, backward_time=1.242, grad_norm=94.176, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.057e-04, train_time=3.143 +[gpub015:0/64] 2023-07-05 00:54:38,891 (trainer:732) INFO: 13epoch:train:1901-2000batch: iter_time=1.082e-04, forward_time=0.145, loss_ctc=78.779, loss_att=62.722, acc=0.687, loss=67.539, backward_time=1.243, grad_norm=96.279, clip=100.000, loss_scale=8.796e+12, optim_step_time=0.181, optim0_lr0=1.057e-04, train_time=3.150 +[gpub015:0/64] 2023-07-05 00:57:18,535 (trainer:732) INFO: 13epoch:train:2001-2100batch: iter_time=1.124e-04, forward_time=0.145, loss_ctc=83.142, loss_att=62.800, acc=0.676, loss=68.903, backward_time=1.244, grad_norm=93.127, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.056e-04, train_time=3.193 +[gpub015:0/64] 2023-07-05 01:00:10,027 (trainer:732) INFO: 13epoch:train:2101-2200batch: iter_time=1.114e-04, forward_time=0.145, loss_ctc=87.896, loss_att=72.933, acc=0.670, loss=77.422, backward_time=1.296, grad_norm=109.444, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.056e-04, train_time=3.430 +[gpub015:0/64] 2023-07-05 01:02:47,356 (trainer:732) INFO: 13epoch:train:2201-2300batch: iter_time=1.144e-04, forward_time=0.144, loss_ctc=76.474, loss_att=59.030, acc=0.674, loss=64.263, backward_time=1.243, grad_norm=92.859, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.055e-04, train_time=3.146 +[gpub015:0/64] 2023-07-05 01:05:27,710 (trainer:732) INFO: 
13epoch:train:2301-2400batch: iter_time=1.066e-04, forward_time=0.144, loss_ctc=76.529, loss_att=59.750, acc=0.682, loss=64.784, backward_time=1.244, grad_norm=89.961, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.055e-04, train_time=3.207 +[gpub015:0/64] 2023-07-05 01:08:04,714 (multiple_iter_factory:32) INFO: Building 3th iter-factory... +[gpub015:0/64] 2023-07-05 01:08:22,388 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 01:08:25,735 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.0", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.0", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.0", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.0", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 01:08:25,735 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.0, +[gpub015:0/64] 2023-07-05 01:08:25,742 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 01:13:37,098 (trainer:732) INFO: 13epoch:train:2401-2500batch: iter_time=1.192, forward_time=0.144, loss_ctc=70.399, loss_att=53.578, acc=0.702, loss=58.624, backward_time=1.244, grad_norm=79.956, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.054e-04, train_time=9.788 +[gpub015:0/64] 2023-07-05 01:16:16,640 (trainer:732) INFO: 13epoch:train:2501-2600batch: iter_time=1.207e-04, forward_time=0.145, loss_ctc=77.710, loss_att=67.795, acc=0.656, loss=70.770, backward_time=1.249, grad_norm=156.311, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.182, optim0_lr0=1.054e-04, train_time=3.191 +[gpub015:0/64] 2023-07-05 01:18:53,489 (trainer:732) INFO: 13epoch:train:2601-2700batch: iter_time=1.184e-04, forward_time=0.144, loss_ctc=71.797, loss_att=51.023, acc=0.687, loss=57.255, backward_time=1.240, grad_norm=156.086, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.053e-04, train_time=3.137 +[gpub015:0/64] 2023-07-05 01:21:30,516 (trainer:732) INFO: 13epoch:train:2701-2800batch: iter_time=1.030e-04, forward_time=0.144, loss_ctc=78.373, loss_att=62.065, acc=0.669, loss=66.957, backward_time=1.241, grad_norm=95.962, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.053e-04, train_time=3.140 +[gpub015:0/64] 2023-07-05 01:24:07,692 (trainer:732) INFO: 13epoch:train:2801-2900batch: iter_time=9.315e-05, forward_time=0.143, loss_ctc=83.675, loss_att=66.663, acc=0.670, loss=71.766, backward_time=1.242, grad_norm=96.929, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.052e-04, train_time=3.143 +[gpub015:0/64] 2023-07-05 01:26:45,090 (trainer:732) INFO: 13epoch:train:2901-3000batch: iter_time=1.043e-04, forward_time=0.144, loss_ctc=86.928, loss_att=75.665, acc=0.659, loss=79.044, backward_time=1.243, grad_norm=119.641, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.052e-04, train_time=3.148 +[gpub015:0/64] 2023-07-05 01:29:22,331 (trainer:732) INFO: 13epoch:train:3001-3100batch: iter_time=9.929e-05, forward_time=0.144, loss_ctc=74.752, loss_att=55.415, acc=0.672, loss=61.216, backward_time=1.242, grad_norm=108.366, clip=100.000, loss_scale=1.759e+13, 
optim_step_time=0.181, optim0_lr0=1.052e-04, train_time=3.145 +[gpub015:0/64] 2023-07-05 01:31:59,171 (trainer:732) INFO: 13epoch:train:3101-3200batch: iter_time=9.663e-05, forward_time=0.143, loss_ctc=75.329, loss_att=58.040, acc=0.669, loss=63.226, backward_time=1.239, grad_norm=93.650, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.051e-04, train_time=3.137 +[gpub015:0/64] 2023-07-05 01:34:36,282 (trainer:732) INFO: 13epoch:train:3201-3300batch: iter_time=1.020e-04, forward_time=0.143, loss_ctc=71.885, loss_att=54.805, acc=0.700, loss=59.929, backward_time=1.241, grad_norm=90.742, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.051e-04, train_time=3.142 +[gpub015:0/64] 2023-07-05 01:35:27,815 (multiple_iter_factory:32) INFO: Building 4th iter-factory... +[gpub015:0/64] 2023-07-05 01:35:46,394 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 01:35:49,849 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.6", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.6", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.6", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.6", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 01:35:49,849 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.6, +[gpub015:0/64] 2023-07-05 01:35:49,855 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 01:42:13,884 (trainer:732) INFO: 13epoch:train:3301-3400batch: iter_time=1.232, forward_time=0.144, loss_ctc=75.439, loss_att=63.828, acc=0.678, loss=67.311, backward_time=1.257, grad_norm=94.817, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.050e-04, train_time=9.152 +[gpub015:0/64] 2023-07-05 01:44:51,700 (trainer:732) INFO: 13epoch:train:3401-3500batch: iter_time=1.174e-04, forward_time=0.145, loss_ctc=74.146, loss_att=54.359, acc=0.677, loss=60.295, backward_time=1.243, grad_norm=122.599, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.050e-04, train_time=3.156 +[gpub015:0/64] 2023-07-05 01:47:28,958 (trainer:732) INFO: 13epoch:train:3501-3600batch: iter_time=1.050e-04, forward_time=0.145, loss_ctc=73.295, loss_att=55.148, acc=0.682, loss=60.592, backward_time=1.243, grad_norm=88.590, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.049e-04, train_time=3.145 +[gpub015:0/64] 2023-07-05 01:50:06,157 (trainer:732) INFO: 13epoch:train:3601-3700batch: iter_time=1.063e-04, forward_time=0.145, loss_ctc=83.897, loss_att=65.933, acc=0.670, loss=71.322, backward_time=1.242, grad_norm=103.744, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.049e-04, train_time=3.144 +[gpub015:0/64] 2023-07-05 01:52:43,823 (trainer:732) INFO: 13epoch:train:3701-3800batch: iter_time=1.084e-04, forward_time=0.144, loss_ctc=83.779, loss_att=69.293, acc=0.660, loss=73.639, backward_time=1.243, grad_norm=108.364, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.048e-04, train_time=3.153 +[gpub015:0/64] 2023-07-05 01:55:26,107 (trainer:732) INFO: 13epoch:train:3801-3900batch: iter_time=1.207e-04, forward_time=0.144, 
loss_ctc=82.365, loss_att=67.557, acc=0.668, loss=71.999, backward_time=1.247, grad_norm=101.347, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.048e-04, train_time=3.245 +[gpub015:0/64] 2023-07-05 01:58:04,045 (trainer:732) INFO: 13epoch:train:3901-4000batch: iter_time=1.078e-04, forward_time=0.144, loss_ctc=74.649, loss_att=55.654, acc=0.670, loss=61.352, backward_time=1.243, grad_norm=98.467, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.182, optim0_lr0=1.047e-04, train_time=3.159 +[gpub015:0/64] 2023-07-05 02:00:41,402 (trainer:732) INFO: 13epoch:train:4001-4100batch: iter_time=1.069e-04, forward_time=0.146, loss_ctc=73.796, loss_att=60.178, acc=0.683, loss=64.263, backward_time=1.243, grad_norm=95.938, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.047e-04, train_time=3.147 +[gpub015:0/64] 2023-07-05 02:02:24,654 (multiple_iter_factory:32) INFO: Building 5th iter-factory... +[gpub015:0/64] 2023-07-05 02:02:42,647 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 02:02:46,089 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.10", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.10", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.10", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.10", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 02:02:46,089 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.10, +[gpub015:0/64] 2023-07-05 02:02:46,095 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 02:07:52,811 (trainer:732) INFO: 13epoch:train:4101-4200batch: iter_time=1.202, forward_time=0.144, loss_ctc=74.415, loss_att=55.435, acc=0.692, loss=61.129, backward_time=1.254, grad_norm=107.073, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.046e-04, train_time=8.628 +[gpub015:0/64] 2023-07-05 02:10:30,964 (trainer:732) INFO: 13epoch:train:4201-4300batch: iter_time=1.046e-04, forward_time=0.144, loss_ctc=72.424, loss_att=58.787, acc=0.670, loss=62.878, backward_time=1.242, grad_norm=122.387, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.046e-04, train_time=3.163 +[gpub015:0/64] 2023-07-05 02:13:08,002 (trainer:732) INFO: 13epoch:train:4301-4400batch: iter_time=1.043e-04, forward_time=0.144, loss_ctc=75.996, loss_att=56.412, acc=0.673, loss=62.287, backward_time=1.240, grad_norm=92.333, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.046e-04, train_time=3.141 +[gpub015:0/64] 2023-07-05 02:15:45,069 (trainer:732) INFO: 13epoch:train:4401-4500batch: iter_time=1.084e-04, forward_time=0.145, loss_ctc=77.586, loss_att=60.651, acc=0.678, loss=65.732, backward_time=1.241, grad_norm=94.112, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.045e-04, train_time=3.141 +[gpub015:0/64] 2023-07-05 02:18:22,044 (trainer:732) INFO: 13epoch:train:4501-4600batch: iter_time=1.049e-04, forward_time=0.144, loss_ctc=82.242, loss_att=64.662, acc=0.670, loss=69.936, backward_time=1.240, grad_norm=115.961, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.045e-04, train_time=3.139 
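Each "Building Nth iter-factory" line above constructs a fresh iterator over one of the 12 training splits, and the split order differs between epochs (split.6, .8, .10, .5, ... in epoch 12 versus split.4, .1, .5, .0, ... in epoch 13). The first 100-batch block after every build also shows iter_time jumping from ~1e-4 s to ~1.2-1.8 s: the rebuild briefly stalls the input pipeline and inflates that block's train_time to 6.5-9.8. A minimal sketch of the pattern, an illustration rather than the actual espnet2 implementation (the naive f"{n}th" even reproduces the "1th"/"2th"/"3th" ordinals seen in the log):

import random
from typing import Callable, Iterable, Iterator

def multiple_iter_factory(build_split_iter: Callable[[int], Iterable],
                          num_splits: int, seed: int) -> Iterator:
    """Yield batches split by split, building each sub-iterator lazily
    in a per-epoch shuffled order."""
    order = list(range(num_splits))
    random.Random(seed).shuffle(order)         # seed varies with the epoch
    for n, split in enumerate(order):
        print(f"Building {n}th iter-factory...")  # cf. the log lines above
        yield from build_split_iter(split)     # first batch pays the build cost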
+[gpub015:0/64] 2023-07-05 02:20:59,514 (trainer:732) INFO: 13epoch:train:4601-4700batch: iter_time=1.072e-04, forward_time=0.145, loss_ctc=92.408, loss_att=76.645, acc=0.664, loss=81.374, backward_time=1.244, grad_norm=106.089, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.044e-04, train_time=3.149 +[gpub015:0/64] 2023-07-05 02:23:36,368 (trainer:732) INFO: 13epoch:train:4701-4800batch: iter_time=1.045e-04, forward_time=0.144, loss_ctc=69.859, loss_att=51.347, acc=0.673, loss=56.901, backward_time=1.239, grad_norm=89.948, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.044e-04, train_time=3.137 +[gpub015:0/64] 2023-07-05 02:26:14,585 (trainer:732) INFO: 13epoch:train:4801-4900batch: iter_time=1.006e-04, forward_time=0.145, loss_ctc=77.996, loss_att=63.746, acc=0.677, loss=68.021, backward_time=1.244, grad_norm=88.365, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.043e-04, train_time=3.164 +[gpub015:0/64] 2023-07-05 02:28:53,321 (multiple_iter_factory:32) INFO: Building 6th iter-factory... +[gpub015:0/64] 2023-07-05 02:29:11,386 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 02:29:15,170 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.7", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.7", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.7", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.7", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 02:29:15,170 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.7, +[gpub015:0/64] 2023-07-05 02:29:15,176 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 02:32:58,371 (trainer:732) INFO: 13epoch:train:4901-5000batch: iter_time=1.213, forward_time=0.144, loss_ctc=65.774, loss_att=47.844, acc=0.702, loss=53.223, backward_time=1.251, grad_norm=75.759, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.182, optim0_lr0=1.043e-04, train_time=8.075 +[gpub015:0/64] 2023-07-05 02:35:38,624 (trainer:732) INFO: 13epoch:train:5001-5100batch: iter_time=9.429e-05, forward_time=0.147, loss_ctc=76.403, loss_att=64.562, acc=0.673, loss=68.115, backward_time=1.250, grad_norm=106.879, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.182, optim0_lr0=1.042e-04, train_time=3.205 +[gpub015:0/64] 2023-07-05 02:38:15,647 (trainer:732) INFO: 13epoch:train:5101-5200batch: iter_time=9.749e-05, forward_time=0.145, loss_ctc=71.952, loss_att=51.831, acc=0.688, loss=57.867, backward_time=1.241, grad_norm=89.609, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.042e-04, train_time=3.140 +[gpub015:0/64] 2023-07-05 02:40:52,928 (trainer:732) INFO: 13epoch:train:5201-5300batch: iter_time=9.622e-05, forward_time=0.145, loss_ctc=76.889, loss_att=62.062, acc=0.679, loss=66.510, backward_time=1.243, grad_norm=98.013, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.182, optim0_lr0=1.041e-04, train_time=3.145 +[gpub015:0/64] 2023-07-05 02:43:30,305 (trainer:732) INFO: 13epoch:train:5301-5400batch: iter_time=9.506e-05, forward_time=0.146, loss_ctc=83.260, loss_att=66.659, acc=0.681, loss=71.639, 
backward_time=1.244, grad_norm=97.815, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.041e-04, train_time=3.147 +[gpub015:0/64] 2023-07-05 02:46:07,967 (trainer:732) INFO: 13epoch:train:5401-5500batch: iter_time=9.793e-05, forward_time=0.145, loss_ctc=87.108, loss_att=75.168, acc=0.671, loss=78.750, backward_time=1.246, grad_norm=104.971, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.041e-04, train_time=3.153 +[gpub015:0/64] 2023-07-05 02:48:45,133 (trainer:732) INFO: 13epoch:train:5501-5600batch: iter_time=1.216e-04, forward_time=0.145, loss_ctc=73.208, loss_att=55.122, acc=0.679, loss=60.548, backward_time=1.241, grad_norm=98.135, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.040e-04, train_time=3.143 +[gpub015:0/64] 2023-07-05 02:51:22,309 (trainer:732) INFO: 13epoch:train:5601-5700batch: iter_time=1.200e-04, forward_time=0.145, loss_ctc=73.220, loss_att=56.231, acc=0.682, loss=61.328, backward_time=1.242, grad_norm=88.409, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.040e-04, train_time=3.143 +[gpub015:0/64] 2023-07-05 02:53:59,660 (trainer:732) INFO: 13epoch:train:5701-5800batch: iter_time=1.222e-04, forward_time=0.145, loss_ctc=71.839, loss_att=55.799, acc=0.701, loss=60.611, backward_time=1.242, grad_norm=86.829, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.039e-04, train_time=3.147 +[gpub015:0/64] 2023-07-05 02:54:51,451 (multiple_iter_factory:32) INFO: Building 7th iter-factory... +[gpub015:0/64] 2023-07-05 02:55:09,616 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 02:55:12,984 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.3", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.3", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.3", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.3", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 02:55:12,984 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.3, +[gpub015:0/64] 2023-07-05 02:55:12,991 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 03:00:53,451 (trainer:732) INFO: 13epoch:train:5801-5900batch: iter_time=1.204, forward_time=0.145, loss_ctc=74.023, loss_att=61.691, acc=0.691, loss=65.390, backward_time=1.254, grad_norm=93.924, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.039e-04, train_time=8.276 +[gpub015:0/64] 2023-07-05 03:03:31,383 (trainer:732) INFO: 13epoch:train:5901-6000batch: iter_time=1.073e-04, forward_time=0.145, loss_ctc=73.747, loss_att=54.734, acc=0.677, loss=60.438, backward_time=1.243, grad_norm=92.066, clip=100.000, loss_scale=1.759e+13, optim_step_time=0.181, optim0_lr0=1.038e-04, train_time=3.158 +[gpub015:0/64] 2023-07-05 03:06:08,513 (trainer:732) INFO: 13epoch:train:6001-6100batch: iter_time=1.025e-04, forward_time=0.143, loss_ctc=73.596, loss_att=56.085, acc=0.692, loss=61.338, backward_time=1.242, grad_norm=87.934, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.038e-04, train_time=3.142 +[gpub015:0/64] 2023-07-05 03:08:45,546 (trainer:732) INFO: 
13epoch:train:6101-6200batch: iter_time=9.164e-05, forward_time=0.143, loss_ctc=82.621, loss_att=65.377, acc=0.684, loss=70.550, backward_time=1.242, grad_norm=93.644, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.037e-04, train_time=3.140 +[gpub015:0/64] 2023-07-05 03:11:22,948 (trainer:732) INFO: 13epoch:train:6201-6300batch: iter_time=9.824e-05, forward_time=0.144, loss_ctc=83.357, loss_att=69.309, acc=0.670, loss=73.524, backward_time=1.244, grad_norm=95.890, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.037e-04, train_time=3.148 +[gpub015:0/64] 2023-07-05 03:14:00,619 (trainer:732) INFO: 13epoch:train:6301-6400batch: iter_time=9.997e-05, forward_time=0.145, loss_ctc=81.690, loss_att=66.899, acc=0.678, loss=71.336, backward_time=1.245, grad_norm=112.957, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.036e-04, train_time=3.153 +[gpub015:0/64] 2023-07-05 03:16:37,757 (trainer:732) INFO: 13epoch:train:6401-6500batch: iter_time=1.112e-04, forward_time=0.145, loss_ctc=73.800, loss_att=53.459, acc=0.684, loss=59.561, backward_time=1.242, grad_norm=92.890, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.036e-04, train_time=3.143 +[gpub015:0/64] 2023-07-05 03:19:14,828 (trainer:732) INFO: 13epoch:train:6501-6600batch: iter_time=1.123e-04, forward_time=0.144, loss_ctc=74.459, loss_att=61.204, acc=0.684, loss=65.181, backward_time=1.241, grad_norm=87.396, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.036e-04, train_time=3.141 +[gpub015:0/64] 2023-07-05 03:20:57,945 (multiple_iter_factory:32) INFO: Building 8th iter-factory... +[gpub015:0/64] 2023-07-05 03:21:15,656 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 03:21:19,045 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.9", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 03:21:19,045 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.9, +[gpub015:0/64] 2023-07-05 03:21:19,052 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 03:25:54,443 (trainer:732) INFO: 13epoch:train:6601-6700batch: iter_time=1.218, forward_time=0.144, loss_ctc=73.140, loss_att=52.638, acc=0.707, loss=58.789, backward_time=1.251, grad_norm=94.399, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.035e-04, train_time=7.992 +[gpub015:0/64] 2023-07-05 03:28:44,608 (trainer:732) INFO: 13epoch:train:6701-6800batch: iter_time=1.096e-04, forward_time=0.144, loss_ctc=73.678, loss_att=58.389, acc=0.673, loss=62.975, backward_time=1.260, grad_norm=107.744, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.035e-04, train_time=3.403 +[gpub015:0/64] 2023-07-05 03:31:30,183 (trainer:732) INFO: 13epoch:train:6801-6900batch: iter_time=1.042e-04, forward_time=0.144, loss_ctc=73.706, loss_att=55.405, acc=0.686, loss=60.895, backward_time=1.247, grad_norm=96.906, clip=100.000, loss_scale=3.518e+13, 
optim_step_time=0.181, optim0_lr0=1.034e-04, train_time=3.311 +[gpub015:0/64] 2023-07-05 03:34:08,360 (trainer:732) INFO: 13epoch:train:6901-7000batch: iter_time=1.082e-04, forward_time=0.144, loss_ctc=78.905, loss_att=61.541, acc=0.686, loss=66.750, backward_time=1.242, grad_norm=102.781, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.034e-04, train_time=3.163 +[gpub015:0/64] 2023-07-05 03:36:45,724 (trainer:732) INFO: 13epoch:train:7001-7100batch: iter_time=1.027e-04, forward_time=0.144, loss_ctc=82.124, loss_att=65.694, acc=0.673, loss=70.623, backward_time=1.242, grad_norm=99.630, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.033e-04, train_time=3.147 +[gpub015:0/64] 2023-07-05 03:39:23,628 (trainer:732) INFO: 13epoch:train:7101-7200batch: iter_time=1.048e-04, forward_time=0.147, loss_ctc=88.303, loss_att=75.234, acc=0.673, loss=79.155, backward_time=1.246, grad_norm=106.563, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.033e-04, train_time=3.158 +[gpub015:0/64] 2023-07-05 03:42:01,838 (trainer:732) INFO: 13epoch:train:7201-7300batch: iter_time=1.028e-04, forward_time=0.144, loss_ctc=68.219, loss_att=50.582, acc=0.684, loss=55.873, backward_time=1.243, grad_norm=81.465, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.033e-04, train_time=3.164 +[gpub015:0/64] 2023-07-05 03:44:39,356 (trainer:732) INFO: 13epoch:train:7301-7400batch: iter_time=1.055e-04, forward_time=0.145, loss_ctc=77.835, loss_att=62.872, acc=0.687, loss=67.360, backward_time=1.244, grad_norm=87.557, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.032e-04, train_time=3.150 +[gpub015:0/64] 2023-07-05 03:47:15,904 (multiple_iter_factory:32) INFO: Building 9th iter-factory... 
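Editor's note on the per-batch records: each one reports loss_ctc, loss_att, and a combined loss, and the combined value is consistent with a fixed interpolation loss = w * loss_ctc + (1 - w) * loss_att with w = 0.3. The weight is not printed anywhere in this log, so treat ctc_weight = 0.3 as an assumption read off the numbers (ESPnet sets it in the training YAML). A minimal sketch, checked against the 13epoch:train:6901-7000batch record above:

    # Hybrid CTC/attention loss interpolation implied by the logged values.
    # ctc_weight = 0.3 is an assumption inferred from the records, not a
    # value printed in this log.
    CTC_WEIGHT = 0.3

    def combined_loss(loss_ctc: float, loss_att: float) -> float:
        # loss = w * loss_ctc + (1 - w) * loss_att
        return CTC_WEIGHT * loss_ctc + (1.0 - CTC_WEIGHT) * loss_att

    # 13epoch:train:6901-7000batch: loss_ctc=78.905, loss_att=61.541 -> loss=66.750
    print(round(combined_loss(78.905, 61.541), 3))  # 66.75

The same weight reproduces every record in this section, e.g. 87.108/75.168 -> 78.750 in the 5401-5500batch record.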
+[gpub015:0/64] 2023-07-05 03:47:33,872 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 03:47:37,276 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.11", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.11", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.11", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.11", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 03:47:37,276 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.11, +[gpub015:0/64] 2023-07-05 03:47:37,282 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 03:50:05,764 (trainer:732) INFO: 13epoch:train:7401-7500batch: iter_time=1.192, forward_time=0.144, loss_ctc=65.662, loss_att=47.694, acc=0.711, loss=53.085, backward_time=1.246, grad_norm=79.520, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.032e-04, train_time=6.528 +[gpub015:0/64] 2023-07-05 03:52:45,276 (trainer:732) INFO: 13epoch:train:7501-7600batch: iter_time=9.605e-05, forward_time=0.145, loss_ctc=70.791, loss_att=58.658, acc=0.672, loss=62.298, backward_time=1.247, grad_norm=107.372, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.031e-04, train_time=3.190 +[gpub015:0/64] 2023-07-05 03:55:23,506 (trainer:732) INFO: 13epoch:train:7601-7700batch: iter_time=1.037e-04, forward_time=0.145, loss_ctc=72.470, loss_att=52.026, acc=0.694, loss=58.159, backward_time=1.243, grad_norm=94.745, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.031e-04, train_time=3.164 +[gpub015:0/64] 2023-07-05 03:58:01,078 (trainer:732) INFO: 13epoch:train:7701-7800batch: iter_time=9.728e-05, forward_time=0.144, loss_ctc=80.052, loss_att=60.305, acc=0.687, loss=66.229, backward_time=1.241, grad_norm=113.321, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.030e-04, train_time=3.151 +[gpub015:0/64] 2023-07-05 04:00:38,495 (trainer:732) INFO: 13epoch:train:7801-7900batch: iter_time=9.455e-05, forward_time=0.144, loss_ctc=81.656, loss_att=66.033, acc=0.680, loss=70.720, backward_time=1.242, grad_norm=93.460, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.030e-04, train_time=3.148 +[gpub015:0/64] 2023-07-05 04:03:16,017 (trainer:732) INFO: 13epoch:train:7901-8000batch: iter_time=1.008e-04, forward_time=0.143, loss_ctc=85.972, loss_att=74.825, acc=0.670, loss=78.169, backward_time=1.244, grad_norm=112.494, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.029e-04, train_time=3.150 +[gpub015:0/64] 2023-07-05 04:05:53,016 (trainer:732) INFO: 13epoch:train:8001-8100batch: iter_time=1.015e-04, forward_time=0.143, loss_ctc=74.498, loss_att=55.317, acc=0.680, loss=61.071, backward_time=1.240, grad_norm=91.464, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.029e-04, train_time=3.140 +[gpub015:0/64] 2023-07-05 04:08:30,443 (trainer:732) INFO: 13epoch:train:8101-8200batch: iter_time=9.791e-05, forward_time=0.144, loss_ctc=69.863, loss_att=54.241, acc=0.685, loss=58.927, backward_time=1.242, grad_norm=82.075, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, 
optim0_lr0=1.029e-04, train_time=3.148 +[gpub015:0/64] 2023-07-05 04:11:07,674 (trainer:732) INFO: 13epoch:train:8201-8300batch: iter_time=1.092e-04, forward_time=0.144, loss_ctc=68.595, loss_att=52.123, acc=0.706, loss=57.064, backward_time=1.242, grad_norm=79.546, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.181, optim0_lr0=1.028e-04, train_time=3.144 +[gpub015:0/64] 2023-07-05 04:12:00,871 (multiple_iter_factory:32) INFO: Building 10th iter-factory... +[gpub015:0/64] 2023-07-05 04:12:18,984 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 04:12:22,440 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.2", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.2", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.2", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.2", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 04:12:22,440 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.2, +[gpub015:0/64] 2023-07-05 04:12:22,446 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 04:18:32,238 (trainer:732) INFO: 13epoch:train:8301-8400batch: iter_time=1.223, forward_time=0.146, loss_ctc=72.469, loss_att=59.337, acc=0.686, loss=63.277, backward_time=1.252, grad_norm=96.645, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.028e-04, train_time=8.891 +[gpub015:0/64] 2023-07-05 04:21:10,438 (trainer:732) INFO: 13epoch:train:8401-8500batch: iter_time=1.150e-04, forward_time=0.145, loss_ctc=73.753, loss_att=57.673, acc=0.674, loss=62.497, backward_time=1.243, grad_norm=122.511, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.027e-04, train_time=3.164 +[gpub015:0/64] 2023-07-05 04:23:48,063 (trainer:732) INFO: 13epoch:train:8501-8600batch: iter_time=1.133e-04, forward_time=0.146, loss_ctc=73.060, loss_att=55.876, acc=0.686, loss=61.032, backward_time=1.241, grad_norm=95.020, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.027e-04, train_time=3.152 +[gpub015:0/64] 2023-07-05 04:26:25,397 (trainer:732) INFO: 13epoch:train:8601-8700batch: iter_time=1.242e-04, forward_time=0.145, loss_ctc=82.342, loss_att=65.676, acc=0.670, loss=70.675, backward_time=1.242, grad_norm=98.947, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.026e-04, train_time=3.146 +[gpub015:0/64] 2023-07-05 04:29:04,670 (trainer:732) INFO: 13epoch:train:8701-8800batch: iter_time=1.143e-04, forward_time=0.146, loss_ctc=82.940, loss_att=68.734, acc=0.670, loss=72.996, backward_time=1.243, grad_norm=94.812, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.026e-04, train_time=3.185 +[gpub015:0/64] 2023-07-05 04:31:43,040 (trainer:732) INFO: 13epoch:train:8801-8900batch: iter_time=1.124e-04, forward_time=0.147, loss_ctc=80.299, loss_att=65.836, acc=0.675, loss=70.175, backward_time=1.243, grad_norm=110.899, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.026e-04, train_time=3.167 +[gpub015:0/64] 2023-07-05 04:34:19,985 (trainer:732) INFO: 13epoch:train:8901-9000batch: iter_time=1.233e-04, forward_time=0.145, loss_ctc=74.553, loss_att=55.213, 
acc=0.672, loss=61.015, backward_time=1.240, grad_norm=114.627, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.025e-04, train_time=3.139 +[gpub015:0/64] 2023-07-05 04:36:57,034 (trainer:732) INFO: 13epoch:train:9001-9100batch: iter_time=1.199e-04, forward_time=0.145, loss_ctc=72.019, loss_att=59.355, acc=0.687, loss=63.154, backward_time=1.241, grad_norm=90.972, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.025e-04, train_time=3.141 +[gpub015:0/64] 2023-07-05 04:38:44,132 (multiple_iter_factory:32) INFO: Building 11th iter-factory... +[gpub015:0/64] 2023-07-05 04:39:02,444 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 04:39:06,144 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.8", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.8", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.8", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.8", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 04:39:06,144 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.8, +[gpub015:0/64] 2023-07-05 04:39:06,150 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 04:43:21,746 (trainer:732) INFO: 13epoch:train:9101-9200batch: iter_time=1.211, forward_time=0.146, loss_ctc=72.956, loss_att=53.606, acc=0.696, loss=59.411, backward_time=1.254, grad_norm=87.995, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.024e-04, train_time=7.694 +[gpub015:0/64] 2023-07-05 04:46:02,195 (trainer:732) INFO: 13epoch:train:9201-9300batch: iter_time=1.155e-04, forward_time=0.144, loss_ctc=71.310, loss_att=57.178, acc=0.677, loss=61.418, backward_time=1.247, grad_norm=95.519, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.024e-04, train_time=3.209 +[gpub015:0/64] 2023-07-05 04:48:40,368 (trainer:732) INFO: 13epoch:train:9301-9400batch: iter_time=1.006e-04, forward_time=0.145, loss_ctc=73.757, loss_att=55.866, acc=0.683, loss=61.234, backward_time=1.243, grad_norm=93.753, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.023e-04, train_time=3.163 +[gpub015:0/64] 2023-07-05 04:51:18,276 (trainer:732) INFO: 13epoch:train:9401-9500batch: iter_time=1.172e-04, forward_time=0.144, loss_ctc=76.011, loss_att=59.441, acc=0.685, loss=64.412, backward_time=1.242, grad_norm=89.515, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.023e-04, train_time=3.158 +[gpub015:0/64] 2023-07-05 04:53:55,349 (trainer:732) INFO: 13epoch:train:9501-9600batch: iter_time=1.167e-04, forward_time=0.144, loss_ctc=79.471, loss_att=63.716, acc=0.670, loss=68.443, backward_time=1.241, grad_norm=101.257, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.023e-04, train_time=3.141 +[gpub015:0/64] 2023-07-05 04:56:33,102 (trainer:732) INFO: 13epoch:train:9601-9700batch: iter_time=9.937e-05, forward_time=0.145, loss_ctc=89.171, loss_att=76.309, acc=0.664, loss=80.167, backward_time=1.245, grad_norm=108.677, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.022e-04, train_time=3.155 +[gpub015:0/64] 2023-07-05 04:59:10,081 
(trainer:732) INFO: 13epoch:train:9701-9800batch: iter_time=1.165e-04, forward_time=0.144, loss_ctc=68.036, loss_att=50.680, acc=0.681, loss=55.886, backward_time=1.240, grad_norm=79.168, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.022e-04, train_time=3.139 +[gpub015:0/64] 2023-07-05 05:01:47,472 (trainer:732) INFO: 13epoch:train:9801-9900batch: iter_time=1.160e-04, forward_time=0.145, loss_ctc=75.666, loss_att=61.758, acc=0.685, loss=65.930, backward_time=1.245, grad_norm=98.942, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.021e-04, train_time=3.148 +[gpub015:0/64] 2023-07-05 05:04:24,479 (trainer:732) INFO: 13epoch:train:9901-10000batch: iter_time=1.035e-04, forward_time=0.145, loss_ctc=64.966, loss_att=46.260, acc=0.711, loss=51.872, backward_time=1.241, grad_norm=91.008, clip=100.000, loss_scale=3.518e+13, optim_step_time=0.182, optim0_lr0=1.021e-04, train_time=3.140 +[gpub015:0/64] 2023-07-05 05:17:35,091 (trainer:338) INFO: 13epoch results: [train] iter_time=0.157, forward_time=0.145, loss_ctc=77.137, loss_att=60.711, acc=0.679, loss=65.639, backward_time=1.245, grad_norm=100.824, clip=100.000, loss_scale=2.287e+13, optim_step_time=0.181, optim0_lr0=1.043e-04, train_time=3.766, time=5 hours, 14 minutes and 12.9 seconds, total_count=100000, gpu_max_cached_mem_GB=37.139, [valid] loss_ctc=55.311, cer_ctc=0.308, loss_att=44.884, acc=0.629, cer=0.442, wer=0.994, loss=48.012, time=7 minutes and 2.64 seconds, total_count=10626, gpu_max_cached_mem_GB=37.139, [att_plot] time=5 minutes and 45.23 seconds, total_count=0, gpu_max_cached_mem_GB=37.139 +[gpub015:0/64] 2023-07-05 05:17:50,410 (trainer:386) INFO: The best model has been updated: valid.acc, valid.total_count +[gpub015:0/64] 2023-07-05 05:17:50,415 (trainer:440) INFO: The model files were removed: exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/8epoch.pth +[gpub015:0/64] 2023-07-05 05:17:50,415 (trainer:272) INFO: 14/100epoch started. Estimated time to finish: 2 weeks, 5 days and 11 hours +[gpub015:0/64] 2023-07-05 05:17:50,419 (multiple_iter_factory:32) INFO: Building 0th iter-factory... 
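Editor's note on the epoch summary above: "14/100epoch started. Estimated time to finish: 2 weeks, 5 days and 11 hours" is a straight extrapolation of per-epoch wall-clock cost (train + valid + att_plot) over the 87 epochs that remain. A sketch, with the caveat that the averaging window ESPnet actually uses is an assumption; extrapolating epoch 13's own times lands in the same ballpark:

    # Reproduce the order of magnitude of the logged ETA from the 13epoch
    # results record. How ESPnet averages over past epochs is an assumption;
    # this simply extrapolates epoch 13's cost.
    from datetime import timedelta

    train = timedelta(hours=5, minutes=14, seconds=12.9)  # [train] time
    valid = timedelta(minutes=7, seconds=2.64)            # [valid] time
    plot = timedelta(minutes=5, seconds=45.23)            # [att_plot] time

    remaining_epochs = 100 - 13  # "14/100epoch started"
    print((train + valid + plot) * remaining_epochs)
    # ~19 days 18 hours, the same ballpark as the logged
    # "2 weeks, 5 days and 11 hours" (~19.5 days)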
+[gpub015:0/64] 2023-07-05 05:18:08,009 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 05:18:11,352 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.0", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.0", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.0", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.0", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 05:18:11,353 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.0, +[gpub015:0/64] 2023-07-05 05:18:11,359 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 05:22:38,320 (trainer:732) INFO: 14epoch:train:1-100batch: iter_time=1.236, forward_time=0.146, loss_ctc=67.492, loss_att=49.941, acc=0.685, loss=55.206, backward_time=1.261, grad_norm=90.648, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.020e-04, train_time=5.758 +[gpub015:0/64] 2023-07-05 05:25:16,237 (trainer:732) INFO: 14epoch:train:101-200batch: iter_time=1.042e-04, forward_time=0.145, loss_ctc=76.460, loss_att=60.179, acc=0.660, loss=65.063, backward_time=1.240, grad_norm=93.506, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.020e-04, train_time=3.158 +[gpub015:0/64] 2023-07-05 05:27:54,151 (trainer:732) INFO: 14epoch:train:201-300batch: iter_time=1.080e-04, forward_time=0.146, loss_ctc=71.779, loss_att=54.026, acc=0.681, loss=59.352, backward_time=1.241, grad_norm=86.178, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.020e-04, train_time=3.158 +[gpub015:0/64] 2023-07-05 05:30:31,984 (trainer:732) INFO: 14epoch:train:301-400batch: iter_time=1.215e-04, forward_time=0.146, loss_ctc=74.923, loss_att=54.424, acc=0.671, loss=60.574, backward_time=1.241, grad_norm=97.880, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.019e-04, train_time=3.156 +[gpub015:0/64] 2023-07-05 05:33:17,013 (trainer:732) INFO: 14epoch:train:401-500batch: iter_time=5.366e-04, forward_time=0.209, loss_ctc=74.569, loss_att=59.503, acc=0.676, loss=64.023, backward_time=1.247, grad_norm=94.127, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.184, optim0_lr0=1.019e-04, train_time=3.300 +[gpub015:0/64] 2023-07-05 05:36:02,320 (trainer:732) INFO: 14epoch:train:501-600batch: iter_time=1.064e-04, forward_time=0.200, loss_ctc=68.014, loss_att=54.675, acc=0.665, loss=58.677, backward_time=1.252, grad_norm=82.294, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.184, optim0_lr0=1.018e-04, train_time=3.306 +[gpub015:0/64] 2023-07-05 05:38:53,673 (trainer:732) INFO: 14epoch:train:601-700batch: iter_time=1.074e-04, forward_time=0.162, loss_ctc=79.869, loss_att=65.427, acc=0.663, loss=69.759, backward_time=1.260, grad_norm=100.551, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.183, optim0_lr0=1.018e-04, train_time=3.427 +[gpub015:0/64] 2023-07-05 05:41:42,236 (trainer:732) INFO: 14epoch:train:701-800batch: iter_time=1.092e-04, forward_time=0.146, loss_ctc=87.368, loss_att=57.776, acc=0.685, loss=66.653, backward_time=1.254, grad_norm=128.453, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.017e-04, train_time=3.371 
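Editor's note on the sampler lines repeated above: UnsortedBatchSampler(N-batch=37994, batch_size=128, ...) together with the summary mean=128.0, min=128, max=129 indicates that the remainder of len(keys) / 128 is folded into existing batches rather than emitted as a short tail batch. An illustrative sketch of that arithmetic (this is not the espnet2 implementation, just the same folding):

    # Remainder folding that yields the logged min=128 / max=129 batch sizes.
    # Illustration only; the real sampler is espnet2's UnsortedBatchSampler,
    # and which batches absorb the extra keys is an assumption here.
    def fold_batches(keys, batch_size=128):
        n_batch = len(keys) // batch_size          # 37994 in this log
        sizes = [batch_size] * n_batch
        for i in range(len(keys) - batch_size * n_batch):
            sizes[i] += 1                          # a few batches become 129
        out, start = [], 0
        for s in sizes:
            out.append(keys[start:start + s])
            start += s
        return out

    batches = fold_batches(list(range(37994 * 128 + 5)))
    print(len(batches), min(map(len, batches)), max(map(len, batches)))
    # 37994 128 129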
+[gpub015:0/64] 2023-07-05 05:42:44,919 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpub015:0/64] 2023-07-05 05:43:02,321 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 05:43:05,674 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.9", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 05:43:05,674 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.9, +[gpub015:0/64] 2023-07-05 05:43:05,680 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 05:47:29,182 (trainer:732) INFO: 14epoch:train:801-900batch: iter_time=1.271, forward_time=0.146, loss_ctc=80.548, loss_att=57.074, acc=0.681, loss=64.116, backward_time=1.258, grad_norm=96.622, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.017e-04, train_time=6.939 +[gpub015:0/64] 2023-07-05 05:50:06,931 (trainer:732) INFO: 14epoch:train:901-1000batch: iter_time=1.130e-04, forward_time=0.146, loss_ctc=78.396, loss_att=65.274, acc=0.669, loss=69.210, backward_time=1.243, grad_norm=98.587, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.017e-04, train_time=3.155 +[gpub015:0/64] 2023-07-05 05:52:44,281 (trainer:732) INFO: 14epoch:train:1001-1100batch: iter_time=1.191e-04, forward_time=0.146, loss_ctc=70.269, loss_att=52.890, acc=0.696, loss=58.104, backward_time=1.243, grad_norm=92.744, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.016e-04, train_time=3.147 +[gpub015:0/64] 2023-07-05 05:55:21,561 (trainer:732) INFO: 14epoch:train:1101-1200batch: iter_time=1.158e-04, forward_time=0.145, loss_ctc=71.987, loss_att=52.009, acc=0.680, loss=58.002, backward_time=1.242, grad_norm=90.603, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.016e-04, train_time=3.145 +[gpub015:0/64] 2023-07-05 05:57:58,721 (trainer:732) INFO: 14epoch:train:1201-1300batch: iter_time=1.267e-04, forward_time=0.145, loss_ctc=73.245, loss_att=58.533, acc=0.680, loss=62.946, backward_time=1.242, grad_norm=93.277, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.015e-04, train_time=3.143 +[gpub015:0/64] 2023-07-05 06:00:35,732 (trainer:732) INFO: 14epoch:train:1301-1400batch: iter_time=1.379e-04, forward_time=0.144, loss_ctc=69.153, loss_att=56.204, acc=0.675, loss=60.089, backward_time=1.242, grad_norm=87.236, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.015e-04, train_time=3.140 +[gpub015:0/64] 2023-07-05 06:03:13,091 (trainer:732) INFO: 14epoch:train:1401-1500batch: iter_time=1.207e-04, forward_time=0.145, loss_ctc=75.620, loss_att=62.912, acc=0.676, loss=66.725, backward_time=1.242, grad_norm=122.064, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.014e-04, train_time=3.147 +[gpub015:0/64] 2023-07-05 06:05:50,085 (trainer:732) INFO: 14epoch:train:1501-1600batch: iter_time=1.158e-04, forward_time=0.144, loss_ctc=86.422, loss_att=59.210, acc=0.686, loss=67.374, backward_time=1.240, 
grad_norm=121.982, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.014e-04, train_time=3.140 +[gpub015:0/64] 2023-07-05 06:07:45,814 (multiple_iter_factory:32) INFO: Building 2th iter-factory... +[gpub015:0/64] 2023-07-05 06:08:04,021 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 06:08:07,429 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.10", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.10", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.10", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.10", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 06:08:07,429 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.10, +[gpub015:0/64] 2023-07-05 06:08:07,435 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 06:11:42,622 (trainer:732) INFO: 14epoch:train:1601-1700batch: iter_time=1.516, forward_time=0.145, loss_ctc=91.881, loss_att=63.528, acc=0.682, loss=72.034, backward_time=1.250, grad_norm=122.952, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.014e-04, train_time=7.051 +[gpub015:0/64] 2023-07-05 06:14:20,460 (trainer:732) INFO: 14epoch:train:1701-1800batch: iter_time=1.177e-04, forward_time=0.145, loss_ctc=65.864, loss_att=52.353, acc=0.669, loss=56.407, backward_time=1.243, grad_norm=85.260, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.013e-04, train_time=3.157 +[gpub015:0/64] 2023-07-05 06:16:57,381 (trainer:732) INFO: 14epoch:train:1801-1900batch: iter_time=1.220e-04, forward_time=0.143, loss_ctc=80.381, loss_att=62.098, acc=0.676, loss=67.583, backward_time=1.240, grad_norm=109.302, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.013e-04, train_time=3.138 +[gpub015:0/64] 2023-07-05 06:19:34,703 (trainer:732) INFO: 14epoch:train:1901-2000batch: iter_time=1.216e-04, forward_time=0.146, loss_ctc=65.835, loss_att=46.913, acc=0.695, loss=52.589, backward_time=1.240, grad_norm=77.097, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.012e-04, train_time=3.146 +[gpub015:0/64] 2023-07-05 06:22:11,686 (trainer:732) INFO: 14epoch:train:2001-2100batch: iter_time=1.216e-04, forward_time=0.144, loss_ctc=74.781, loss_att=55.605, acc=0.680, loss=61.358, backward_time=1.240, grad_norm=98.143, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.012e-04, train_time=3.139 +[gpub015:0/64] 2023-07-05 06:24:48,909 (trainer:732) INFO: 14epoch:train:2101-2200batch: iter_time=1.146e-04, forward_time=0.145, loss_ctc=70.364, loss_att=58.732, acc=0.665, loss=62.221, backward_time=1.243, grad_norm=96.995, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.012e-04, train_time=3.144 +[gpub015:0/64] 2023-07-05 06:27:26,091 (trainer:732) INFO: 14epoch:train:2201-2300batch: iter_time=1.191e-04, forward_time=0.145, loss_ctc=71.398, loss_att=61.066, acc=0.665, loss=64.165, backward_time=1.242, grad_norm=86.951, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.011e-04, train_time=3.143 +[gpub015:0/64] 2023-07-05 06:30:03,134 (trainer:732) INFO: 
14epoch:train:2301-2400batch: iter_time=1.059e-04, forward_time=0.145, loss_ctc=79.594, loss_att=55.367, acc=0.691, loss=62.635, backward_time=1.241, grad_norm=111.641, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.011e-04, train_time=3.141 +[gpub015:0/64] 2023-07-05 06:32:40,642 (trainer:732) INFO: 14epoch:train:2401-2500batch: iter_time=1.181e-04, forward_time=0.146, loss_ctc=91.170, loss_att=66.737, acc=0.666, loss=74.067, backward_time=1.244, grad_norm=122.083, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.010e-04, train_time=3.150 +[gpub015:0/64] 2023-07-05 06:32:43,499 (multiple_iter_factory:32) INFO: Building 3th iter-factory... +[gpub015:0/64] 2023-07-05 06:33:01,325 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 06:33:04,751 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.1", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 06:33:04,751 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.1, +[gpub015:0/64] 2023-07-05 06:33:04,758 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 06:38:39,312 (trainer:732) INFO: 14epoch:train:2501-2600batch: iter_time=1.217, forward_time=0.147, loss_ctc=66.054, loss_att=47.825, acc=0.704, loss=53.294, backward_time=1.250, grad_norm=90.048, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.010e-04, train_time=7.173 +[gpub015:0/64] 2023-07-05 06:41:16,644 (trainer:732) INFO: 14epoch:train:2601-2700batch: iter_time=1.133e-04, forward_time=0.145, loss_ctc=74.725, loss_att=59.961, acc=0.674, loss=64.390, backward_time=1.242, grad_norm=98.791, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.010e-04, train_time=3.146 +[gpub015:0/64] 2023-07-05 06:43:53,864 (trainer:732) INFO: 14epoch:train:2701-2800batch: iter_time=1.041e-04, forward_time=0.145, loss_ctc=71.636, loss_att=53.864, acc=0.695, loss=59.196, backward_time=1.242, grad_norm=88.306, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.009e-04, train_time=3.144 +[gpub015:0/64] 2023-07-05 06:46:30,941 (trainer:732) INFO: 14epoch:train:2801-2900batch: iter_time=1.197e-04, forward_time=0.146, loss_ctc=72.656, loss_att=52.692, acc=0.678, loss=58.681, backward_time=1.241, grad_norm=93.367, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.009e-04, train_time=3.141 +[gpub015:0/64] 2023-07-05 06:49:08,213 (trainer:732) INFO: 14epoch:train:2901-3000batch: iter_time=1.044e-04, forward_time=0.144, loss_ctc=72.828, loss_att=58.523, acc=0.687, loss=62.815, backward_time=1.242, grad_norm=92.957, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.008e-04, train_time=3.145 +[gpub015:0/64] 2023-07-05 06:51:45,294 (trainer:732) INFO: 14epoch:train:3001-3100batch: iter_time=1.069e-04, forward_time=0.143, loss_ctc=68.485, loss_att=54.235, acc=0.679, loss=58.510, backward_time=1.241, grad_norm=90.151, clip=100.000, loss_scale=7.037e+13, 
optim_step_time=0.181, optim0_lr0=1.008e-04, train_time=3.141 +[gpub015:0/64] 2023-07-05 06:54:28,653 (trainer:732) INFO: 14epoch:train:3101-3200batch: iter_time=9.495e-05, forward_time=0.144, loss_ctc=79.365, loss_att=65.692, acc=0.672, loss=69.794, backward_time=1.247, grad_norm=93.123, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.007e-04, train_time=3.267 +[gpub015:0/64] 2023-07-05 06:57:07,175 (trainer:732) INFO: 14epoch:train:3201-3300batch: iter_time=9.992e-05, forward_time=0.144, loss_ctc=85.733, loss_att=57.493, acc=0.694, loss=65.965, backward_time=1.243, grad_norm=134.602, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.181, optim0_lr0=1.007e-04, train_time=3.170 +[gpub015:0/64] 2023-07-05 06:58:02,021 (multiple_iter_factory:32) INFO: Building 4th iter-factory... +[gpub015:0/64] 2023-07-05 06:58:20,249 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 06:58:23,672 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.8", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.8", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.8", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.8", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 06:58:23,673 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.8, +[gpub015:0/64] 2023-07-05 06:58:23,679 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 07:02:56,111 (trainer:732) INFO: 14epoch:train:3301-3400batch: iter_time=1.206, forward_time=0.206, loss_ctc=75.066, loss_att=55.125, acc=0.684, loss=61.108, backward_time=1.254, grad_norm=97.874, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.187, optim0_lr0=1.007e-04, train_time=6.978 +[gpub015:0/64] 2023-07-05 07:05:33,511 (trainer:732) INFO: 14epoch:train:3401-3500batch: iter_time=1.175e-04, forward_time=0.147, loss_ctc=80.268, loss_att=64.486, acc=0.664, loss=69.220, backward_time=1.243, grad_norm=102.801, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.006e-04, train_time=3.148 +[gpub015:0/64] 2023-07-05 07:08:10,474 (trainer:732) INFO: 14epoch:train:3501-3600batch: iter_time=1.258e-04, forward_time=0.146, loss_ctc=70.086, loss_att=52.785, acc=0.689, loss=57.975, backward_time=1.240, grad_norm=91.850, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.006e-04, train_time=3.139 +[gpub015:0/64] 2023-07-05 07:10:47,426 (trainer:732) INFO: 14epoch:train:3601-3700batch: iter_time=1.143e-04, forward_time=0.146, loss_ctc=68.709, loss_att=48.855, acc=0.690, loss=54.811, backward_time=1.241, grad_norm=84.286, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.005e-04, train_time=3.139 +[gpub015:0/64] 2023-07-05 07:13:25,490 (trainer:732) INFO: 14epoch:train:3701-3800batch: iter_time=1.245e-04, forward_time=0.146, loss_ctc=71.880, loss_att=56.870, acc=0.684, loss=61.373, backward_time=1.243, grad_norm=101.212, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.005e-04, train_time=3.161 +[gpub015:0/64] 2023-07-05 07:16:02,505 (trainer:732) INFO: 14epoch:train:3801-3900batch: iter_time=1.332e-04, forward_time=0.146, 
loss_ctc=65.690, loss_att=53.660, acc=0.671, loss=57.269, backward_time=1.240, grad_norm=103.338, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.005e-04, train_time=3.140 +[gpub015:0/64] 2023-07-05 07:18:39,723 (trainer:732) INFO: 14epoch:train:3901-4000batch: iter_time=1.226e-04, forward_time=0.147, loss_ctc=75.033, loss_att=62.084, acc=0.672, loss=65.969, backward_time=1.243, grad_norm=101.234, clip=100.000, loss_scale=7.037e+13, optim_step_time=0.182, optim0_lr0=1.004e-04, train_time=3.144 +[gpub015:0/64] 2023-07-05 07:21:16,597 (trainer:732) INFO: 14epoch:train:4001-4100batch: iter_time=1.200e-04, forward_time=0.145, loss_ctc=83.330, loss_att=57.692, acc=0.689, loss=65.383, backward_time=1.241, grad_norm=106.096, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.182, optim0_lr0=1.004e-04, train_time=3.137 +[gpub015:0/64] 2023-07-05 07:23:05,096 (multiple_iter_factory:32) INFO: Building 5th iter-factory... +[gpub015:0/64] 2023-07-05 07:23:23,163 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 07:23:26,571 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.4", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.4", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.4", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.4", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 07:23:26,571 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.4, +[gpub015:0/64] 2023-07-05 07:23:26,577 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 07:27:52,257 (trainer:732) INFO: 14epoch:train:4101-4200batch: iter_time=1.222, forward_time=0.146, loss_ctc=87.297, loss_att=61.477, acc=0.683, loss=69.223, backward_time=1.252, grad_norm=113.635, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.182, optim0_lr0=1.003e-04, train_time=7.913 +[gpub015:0/64] 2023-07-05 07:30:30,045 (trainer:732) INFO: 14epoch:train:4201-4300batch: iter_time=9.853e-05, forward_time=0.145, loss_ctc=67.259, loss_att=52.733, acc=0.675, loss=57.091, backward_time=1.241, grad_norm=99.259, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=1.003e-04, train_time=3.156 +[gpub015:0/64] 2023-07-05 07:33:07,210 (trainer:732) INFO: 14epoch:train:4301-4400batch: iter_time=1.197e-04, forward_time=0.144, loss_ctc=75.113, loss_att=56.926, acc=0.690, loss=62.382, backward_time=1.242, grad_norm=106.822, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=1.003e-04, train_time=3.143 +[gpub015:0/64] 2023-07-05 07:35:43,948 (trainer:732) INFO: 14epoch:train:4401-4500batch: iter_time=1.184e-04, forward_time=0.143, loss_ctc=68.567, loss_att=50.621, acc=0.682, loss=56.005, backward_time=1.240, grad_norm=83.577, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=1.002e-04, train_time=3.135 +[gpub015:0/64] 2023-07-05 07:38:30,505 (trainer:732) INFO: 14epoch:train:4501-4600batch: iter_time=1.128e-04, forward_time=0.146, loss_ctc=71.674, loss_att=50.888, acc=0.691, loss=57.124, backward_time=1.253, grad_norm=85.775, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=1.002e-04, train_time=3.331 
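Editor's note on the loss_scale column: it doubles at long, regular intervals (1.759e+13 -> 3.518e+13 -> 7.037e+13 -> 1.407e+14 just above, and 2.815e+14 later in this epoch), which is the signature of dynamic loss scaling in mixed-precision training: the scale grows by a fixed factor after a run of overflow-free steps and is cut back when a gradient overflows. A toy sketch of the policy; the constants mirror torch.cuda.amp.GradScaler defaults (growth_factor=2.0, backoff_factor=0.5, growth_interval=2000), and whether this run uses exactly those values is an assumption:

    # Toy model of the grow/backoff policy behind the doubling loss_scale
    # column. Constants mirror torch.cuda.amp.GradScaler defaults; that this
    # run uses exactly these values is an assumption.
    class ToyScaler:
        def __init__(self, scale, growth=2.0, backoff=0.5, interval=2000):
            self.scale = scale
            self.growth, self.backoff, self.interval = growth, backoff, interval
            self._good_steps = 0

        def update(self, found_overflow: bool) -> None:
            if found_overflow:
                self.scale *= self.backoff     # halve and restart the count
                self._good_steps = 0
            else:
                self._good_steps += 1
                if self._good_steps == self.interval:
                    self.scale *= self.growth  # double after a clean run
                    self._good_steps = 0

    s = ToyScaler(1.759e13)
    for _ in range(2000):
        s.update(False)
    print(f"{s.scale:.3e}")  # 3.518e+13, the next value in the log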
+[gpub015:0/64] 2023-07-05 07:41:07,397 (trainer:732) INFO: 14epoch:train:4601-4700batch: iter_time=1.265e-04, forward_time=0.143, loss_ctc=73.227, loss_att=60.909, acc=0.677, loss=64.604, backward_time=1.241, grad_norm=95.135, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=1.001e-04, train_time=3.138 +[gpub015:0/64] 2023-07-05 07:43:46,708 (trainer:732) INFO: 14epoch:train:4701-4800batch: iter_time=1.181e-04, forward_time=0.145, loss_ctc=68.192, loss_att=55.093, acc=0.669, loss=59.023, backward_time=1.241, grad_norm=88.359, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=1.001e-04, train_time=3.186 +[gpub015:0/64] 2023-07-05 07:46:23,653 (trainer:732) INFO: 14epoch:train:4801-4900batch: iter_time=1.105e-04, forward_time=0.145, loss_ctc=79.220, loss_att=59.755, acc=0.682, loss=65.594, backward_time=1.241, grad_norm=120.294, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=1.001e-04, train_time=3.139 +[gpub015:0/64] 2023-07-05 07:49:07,600 (trainer:732) INFO: 14epoch:train:4901-5000batch: iter_time=1.119e-04, forward_time=0.145, loss_ctc=86.729, loss_att=63.418, acc=0.679, loss=70.412, backward_time=1.251, grad_norm=111.675, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=1.000e-04, train_time=3.279 +[gpub015:0/64] 2023-07-05 07:49:10,466 (multiple_iter_factory:32) INFO: Building 6th iter-factory... +[gpub015:0/64] 2023-07-05 07:49:28,643 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 07:49:32,109 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.7", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.7", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.7", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.7", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 07:49:32,109 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.7, +[gpub015:0/64] 2023-07-05 07:49:32,115 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 07:54:27,305 (trainer:732) INFO: 14epoch:train:5001-5100batch: iter_time=1.182, forward_time=0.145, loss_ctc=65.866, loss_att=47.473, acc=0.709, loss=52.991, backward_time=1.255, grad_norm=80.499, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.998e-05, train_time=6.394 +[gpub015:0/64] 2023-07-05 07:57:05,479 (trainer:732) INFO: 14epoch:train:5101-5200batch: iter_time=1.089e-04, forward_time=0.146, loss_ctc=72.265, loss_att=58.470, acc=0.677, loss=62.608, backward_time=1.242, grad_norm=95.899, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.994e-05, train_time=3.163 +[gpub015:0/64] 2023-07-05 07:59:42,843 (trainer:732) INFO: 14epoch:train:5201-5300batch: iter_time=1.169e-04, forward_time=0.146, loss_ctc=68.799, loss_att=51.505, acc=0.703, loss=56.693, backward_time=1.242, grad_norm=91.972, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.990e-05, train_time=3.147 +[gpub015:0/64] 2023-07-05 08:02:19,845 (trainer:732) INFO: 14epoch:train:5301-5400batch: iter_time=1.145e-04, forward_time=0.145, loss_ctc=71.880, loss_att=51.948, acc=0.684, loss=57.928, 
backward_time=1.240, grad_norm=84.364, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.986e-05, train_time=3.140 +[gpub015:0/64] 2023-07-05 08:04:57,349 (trainer:732) INFO: 14epoch:train:5401-5500batch: iter_time=1.105e-04, forward_time=0.146, loss_ctc=71.747, loss_att=58.949, acc=0.688, loss=62.788, backward_time=1.241, grad_norm=95.166, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.982e-05, train_time=3.150 +[gpub015:0/64] 2023-07-05 08:07:36,385 (trainer:732) INFO: 14epoch:train:5501-5600batch: iter_time=1.111e-04, forward_time=0.146, loss_ctc=66.692, loss_att=52.899, acc=0.683, loss=57.037, backward_time=1.241, grad_norm=86.102, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.978e-05, train_time=3.181 +[gpub015:0/64] 2023-07-05 08:10:15,268 (trainer:732) INFO: 14epoch:train:5601-5700batch: iter_time=1.079e-04, forward_time=0.146, loss_ctc=76.339, loss_att=65.761, acc=0.678, loss=68.934, backward_time=1.244, grad_norm=93.247, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.974e-05, train_time=3.177 +[gpub015:0/64] 2023-07-05 08:12:52,081 (trainer:732) INFO: 14epoch:train:5701-5800batch: iter_time=1.003e-04, forward_time=0.144, loss_ctc=82.139, loss_att=57.379, acc=0.692, loss=64.807, backward_time=1.240, grad_norm=128.174, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.970e-05, train_time=3.136 +[gpub015:0/64] 2023-07-05 08:13:46,941 (multiple_iter_factory:32) INFO: Building 7th iter-factory... +[gpub015:0/64] 2023-07-05 08:14:04,844 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 08:14:08,310 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.5", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 08:14:08,310 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.5, +[gpub015:0/64] 2023-07-05 08:14:08,316 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 08:19:19,699 (trainer:732) INFO: 14epoch:train:5801-5900batch: iter_time=1.217, forward_time=0.145, loss_ctc=71.224, loss_att=52.241, acc=0.699, loss=57.936, backward_time=1.247, grad_norm=98.314, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.966e-05, train_time=7.752 +[gpub015:0/64] 2023-07-05 08:21:59,636 (trainer:732) INFO: 14epoch:train:5901-6000batch: iter_time=1.174e-04, forward_time=0.145, loss_ctc=74.291, loss_att=61.484, acc=0.686, loss=65.326, backward_time=1.251, grad_norm=92.289, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.962e-05, train_time=3.199 +[gpub015:0/64] 2023-07-05 08:24:37,095 (trainer:732) INFO: 14epoch:train:6001-6100batch: iter_time=1.225e-04, forward_time=0.145, loss_ctc=73.843, loss_att=54.203, acc=0.692, loss=60.095, backward_time=1.241, grad_norm=98.891, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.958e-05, train_time=3.149 +[gpub015:0/64] 2023-07-05 08:27:16,819 (trainer:732) INFO: 
14epoch:train:6101-6200batch: iter_time=1.314e-04, forward_time=0.145, loss_ctc=63.824, loss_att=46.318, acc=0.701, loss=51.570, backward_time=1.242, grad_norm=82.169, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.954e-05, train_time=3.194 +[gpub015:0/64] 2023-07-05 08:29:58,576 (trainer:732) INFO: 14epoch:train:6201-6300batch: iter_time=1.331e-04, forward_time=0.146, loss_ctc=76.866, loss_att=57.439, acc=0.689, loss=63.267, backward_time=1.252, grad_norm=97.819, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.180, optim0_lr0=9.950e-05, train_time=3.235 +[gpub015:0/64] 2023-07-05 08:32:35,761 (trainer:732) INFO: 14epoch:train:6301-6400batch: iter_time=1.293e-04, forward_time=0.144, loss_ctc=66.634, loss_att=56.034, acc=0.687, loss=59.214, backward_time=1.241, grad_norm=102.060, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.946e-05, train_time=3.143 +[gpub015:0/64] 2023-07-05 08:35:12,784 (trainer:732) INFO: 14epoch:train:6401-6500batch: iter_time=1.259e-04, forward_time=0.145, loss_ctc=72.410, loss_att=59.662, acc=0.683, loss=63.486, backward_time=1.242, grad_norm=99.566, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.942e-05, train_time=3.140 +[gpub015:0/64] 2023-07-05 08:37:49,786 (trainer:732) INFO: 14epoch:train:6501-6600batch: iter_time=1.133e-04, forward_time=0.144, loss_ctc=83.069, loss_att=58.236, acc=0.699, loss=65.686, backward_time=1.242, grad_norm=114.626, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.938e-05, train_time=3.140 +[gpub015:0/64] 2023-07-05 08:39:39,785 (multiple_iter_factory:32) INFO: Building 8th iter-factory... +[gpub015:0/64] 2023-07-05 08:39:57,765 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 08:40:01,165 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.2", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.2", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.2", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.2", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 08:40:01,165 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.2, +[gpub015:0/64] 2023-07-05 08:40:01,171 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 08:45:39,161 (trainer:732) INFO: 14epoch:train:6601-6700batch: iter_time=1.302, forward_time=0.147, loss_ctc=81.601, loss_att=58.591, acc=0.689, loss=65.494, backward_time=1.254, grad_norm=111.016, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.182, optim0_lr0=9.935e-05, train_time=9.386 +[gpub015:0/64] 2023-07-05 08:48:20,753 (trainer:732) INFO: 14epoch:train:6701-6800batch: iter_time=1.204e-04, forward_time=0.152, loss_ctc=64.824, loss_att=53.322, acc=0.679, loss=56.773, backward_time=1.251, grad_norm=91.480, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.931e-05, train_time=3.233 +[gpub015:0/64] 2023-07-05 08:50:57,965 (trainer:732) INFO: 14epoch:train:6801-6900batch: iter_time=1.248e-04, forward_time=0.145, loss_ctc=74.189, loss_att=56.752, acc=0.696, loss=61.983, backward_time=1.242, grad_norm=88.689, clip=100.000, loss_scale=1.407e+14, 
optim_step_time=0.181, optim0_lr0=9.927e-05, train_time=3.144 +[gpub015:0/64] 2023-07-05 08:53:34,859 (trainer:732) INFO: 14epoch:train:6901-7000batch: iter_time=1.250e-04, forward_time=0.144, loss_ctc=67.349, loss_att=48.904, acc=0.691, loss=54.437, backward_time=1.240, grad_norm=96.258, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.923e-05, train_time=3.138 +[gpub015:0/64] 2023-07-05 08:56:11,811 (trainer:732) INFO: 14epoch:train:7001-7100batch: iter_time=1.255e-04, forward_time=0.145, loss_ctc=70.955, loss_att=50.384, acc=0.695, loss=56.555, backward_time=1.239, grad_norm=99.455, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.919e-05, train_time=3.139 +[gpub015:0/64] 2023-07-05 08:58:48,943 (trainer:732) INFO: 14epoch:train:7101-7200batch: iter_time=1.220e-04, forward_time=0.143, loss_ctc=73.206, loss_att=61.616, acc=0.676, loss=65.093, backward_time=1.241, grad_norm=86.657, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.915e-05, train_time=3.142 +[gpub015:0/64] 2023-07-05 09:01:26,106 (trainer:732) INFO: 14epoch:train:7201-7300batch: iter_time=1.328e-04, forward_time=0.146, loss_ctc=66.192, loss_att=55.575, acc=0.671, loss=58.760, backward_time=1.241, grad_norm=92.181, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.911e-05, train_time=3.143 +[gpub015:0/64] 2023-07-05 09:04:03,534 (trainer:732) INFO: 14epoch:train:7301-7400batch: iter_time=1.310e-04, forward_time=0.145, loss_ctc=79.538, loss_att=59.235, acc=0.689, loss=65.326, backward_time=1.242, grad_norm=100.524, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.907e-05, train_time=3.148 +[gpub015:0/64] 2023-07-05 09:06:42,404 (trainer:732) INFO: 14epoch:train:7401-7500batch: iter_time=1.288e-04, forward_time=0.144, loss_ctc=86.486, loss_att=62.275, acc=0.679, loss=69.538, backward_time=1.242, grad_norm=101.606, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.181, optim0_lr0=9.903e-05, train_time=3.177 +[gpub015:0/64] 2023-07-05 09:06:48,753 (multiple_iter_factory:32) INFO: Building 9th iter-factory... 
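Editor's note on optim0_lr0: it decays monotonically through these records (about 1.041e-04 early in epoch 13 down to 9.9e-05 here), which matches the warmup-then-inverse-square-root schedule suggested by the run name (lr 2.5e-4, warmup 10k steps). The step-to-batch mapping is not recoverable from the log (gradient accumulation can decouple optimizer steps from logged batch counts), so treat the exact correspondence as an assumption; the shape is:

    # Warmup + inverse-square-root LR schedule suggested by the run name
    # (lr2.5e-4_warmup10k). Equivalent to ESPnet's WarmupLR form:
    #   lr = peak * warmup**0.5 * min(step**-0.5, step * warmup**-1.5)
    # Mapping logged batch indices to optimizer steps is an assumption.
    def warmup_lr(step: int, peak: float = 2.5e-4, warmup: int = 10_000) -> float:
        return peak * min(step / warmup, (warmup / step) ** 0.5)

    for step in (5_000, 10_000, 60_000):
        print(step, f"{warmup_lr(step):.3e}")
    # 5000 1.250e-04 | 10000 2.500e-04 | 60000 1.021e-04, the last matching
    # the optim0_lr0 logged near the end of epoch 13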
+[gpub015:0/64] 2023-07-05 09:07:06,613 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 09:07:10,023 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.11", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.11", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.11", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.11", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 09:07:10,023 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.11, +[gpub015:0/64] 2023-07-05 09:07:10,095 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 09:14:08,695 (trainer:732) INFO: 14epoch:train:7501-7600batch: iter_time=1.646, forward_time=0.177, loss_ctc=69.696, loss_att=50.684, acc=0.697, loss=56.387, backward_time=1.258, grad_norm=83.667, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.183, optim0_lr0=9.899e-05, train_time=8.925 +[gpub015:0/64] 2023-07-05 09:16:46,222 (trainer:732) INFO: 14epoch:train:7601-7700batch: iter_time=9.515e-05, forward_time=0.144, loss_ctc=69.322, loss_att=56.992, acc=0.685, loss=60.691, backward_time=1.242, grad_norm=99.714, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.180, optim0_lr0=9.896e-05, train_time=3.151 +[gpub015:0/64] 2023-07-05 09:19:23,334 (trainer:732) INFO: 14epoch:train:7701-7800batch: iter_time=1.117e-04, forward_time=0.144, loss_ctc=71.131, loss_att=52.075, acc=0.696, loss=57.792, backward_time=1.242, grad_norm=86.667, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.180, optim0_lr0=9.892e-05, train_time=3.142 +[gpub015:0/64] 2023-07-05 09:22:00,431 (trainer:732) INFO: 14epoch:train:7801-7900batch: iter_time=1.339e-04, forward_time=0.144, loss_ctc=73.686, loss_att=51.618, acc=0.692, loss=58.238, backward_time=1.241, grad_norm=96.390, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.180, optim0_lr0=9.888e-05, train_time=3.142 +[gpub015:0/64] 2023-07-05 09:24:37,992 (trainer:732) INFO: 14epoch:train:7901-8000batch: iter_time=1.229e-04, forward_time=0.145, loss_ctc=70.794, loss_att=61.014, acc=0.684, loss=63.948, backward_time=1.244, grad_norm=96.805, clip=100.000, loss_scale=1.407e+14, optim_step_time=0.180, optim0_lr0=9.884e-05, train_time=3.151 +[gpub015:0/64] 2023-07-05 09:27:15,174 (trainer:732) INFO: 14epoch:train:8001-8100batch: iter_time=1.156e-04, forward_time=0.144, loss_ctc=67.169, loss_att=55.923, acc=0.676, loss=59.297, backward_time=1.241, grad_norm=91.698, clip=100.000, loss_scale=2.815e+14, optim_step_time=0.180, optim0_lr0=9.880e-05, train_time=3.143 +[gpub015:0/64] 2023-07-05 09:29:52,590 (trainer:732) INFO: 14epoch:train:8101-8200batch: iter_time=1.278e-04, forward_time=0.145, loss_ctc=78.525, loss_att=62.256, acc=0.685, loss=67.136, backward_time=1.243, grad_norm=99.114, clip=100.000, loss_scale=2.815e+14, optim_step_time=0.180, optim0_lr0=9.876e-05, train_time=3.148 +[gpub015:0/64] 2023-07-05 09:32:30,203 (trainer:732) INFO: 14epoch:train:8201-8300batch: iter_time=1.219e-04, forward_time=0.145, loss_ctc=81.362, loss_att=55.922, acc=0.694, loss=63.554, backward_time=1.242, grad_norm=95.643, clip=100.000, loss_scale=2.815e+14, optim_step_time=0.180, 
optim0_lr0=9.872e-05, train_time=3.152 +[gpub015:0/64] 2023-07-05 09:33:34,938 (multiple_iter_factory:32) INFO: Building 10th iter-factory... +[gpub015:0/64] 2023-07-05 09:33:53,041 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 09:33:56,452 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.6", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.6", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.6", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.6", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 09:33:56,452 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.6, +[gpub015:0/64] 2023-07-05 09:33:56,459 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +[gpub015:0/64] 2023-07-05 09:39:03,625 (trainer:732) INFO: 14epoch:train:8301-8400batch: iter_time=1.854, forward_time=0.165, loss_ctc=73.073, loss_att=53.289, acc=0.697, loss=59.224, backward_time=1.253, grad_norm=90.126, clip=100.000, loss_scale=2.815e+14, optim_step_time=0.182, optim0_lr0=9.869e-05, train_time=7.868 +[gpub015:0/64] 2023-07-05 09:41:49,065 (trainer:732) INFO: 14epoch:train:8401-8500batch: iter_time=1.238e-04, forward_time=0.143, loss_ctc=74.007, loss_att=61.877, acc=0.673, loss=65.516, backward_time=1.249, grad_norm=104.208, clip=100.000, loss_scale=2.815e+14, optim_step_time=0.180, optim0_lr0=9.865e-05, train_time=3.309 +[gpub015:0/64] 2023-07-05 09:44:26,311 (trainer:732) INFO: 14epoch:train:8501-8600batch: iter_time=1.224e-04, forward_time=0.144, loss_ctc=72.420, loss_att=54.139, acc=0.689, loss=59.623, backward_time=1.241, grad_norm=86.207, clip=100.000, loss_scale=2.815e+14, optim_step_time=0.180, optim0_lr0=9.861e-05, train_time=3.145 +[gpub015:0/64] 2023-07-05 09:47:03,231 (trainer:732) INFO: 14epoch:train:8601-8700batch: iter_time=1.255e-04, forward_time=0.144, loss_ctc=63.298, loss_att=45.849, acc=0.702, loss=51.084, backward_time=1.241, grad_norm=78.605, clip=100.000, loss_scale=2.815e+14, optim_step_time=0.180, optim0_lr0=9.857e-05, train_time=3.138 +[gpub015:0/64] 2023-07-05 09:49:40,176 (trainer:732) INFO: 14epoch:train:8701-8800batch: iter_time=1.186e-04, forward_time=0.144, loss_ctc=75.575, loss_att=55.981, acc=0.687, loss=61.859, backward_time=1.241, grad_norm=95.599, clip=100.000, loss_scale=2.815e+14, optim_step_time=0.180, optim0_lr0=9.853e-05, train_time=3.139 +[gpub015:0/64] 2023-07-05 09:52:37,889 (trainer:732) INFO: 14epoch:train:8801-8900batch: iter_time=1.207e-04, forward_time=0.144, loss_ctc=65.438, loss_att=56.468, acc=0.674, loss=59.159, backward_time=1.391, grad_norm=78.677, clip=100.000, loss_scale=2.815e+14, optim_step_time=0.180, optim0_lr0=9.849e-05, train_time=3.554 +[gpub015:0/64] 2023-07-05 09:55:59,096 (trainer:732) INFO: 14epoch:train:8901-9000batch: iter_time=1.066e-04, forward_time=0.144, loss_ctc=70.899, loss_att=58.105, acc=0.677, loss=61.943, backward_time=1.561, grad_norm=86.385, clip=100.000, loss_scale=2.815e+14, optim_step_time=0.180, optim0_lr0=9.846e-05, train_time=4.024 +[gpub015:0/64] 2023-07-05 09:59:18,624 (trainer:732) INFO: 14epoch:train:9001-9100batch: iter_time=1.067e-04, forward_time=0.145, loss_ctc=81.516, loss_att=57.092, 
acc=0.700, loss=64.419, backward_time=1.548, grad_norm=122.948, clip=100.000, loss_scale=2.815e+14, optim_step_time=0.180, optim0_lr0=9.842e-05, train_time=3.990 +[gpub015:0/64] 2023-07-05 10:01:35,903 (multiple_iter_factory:32) INFO: Building 11th iter-factory... +[gpub015:0/64] 2023-07-05 10:01:53,829 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub015:0/64] 2023-07-05 10:01:57,524 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.3", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.3", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.3", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.3", "type": "text"} + preprocess: ) +[gpub015:0/64] 2023-07-05 10:01:57,525 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=37994, batch_size=128, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.3, +[gpub015:0/64] 2023-07-05 10:01:57,531 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=37994, mean=128.0, min=128, max=129 +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") 
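Editor's note on the failure below: after the 11th iter-factory is built, every worker starts emitting "Can't initialize NVML" warnings, and SpawnProcess-4 then dies with "CUDA error: unknown error" inside espnet2/train/trainer.py:516, where train_one_epoch all-reduces an iterator_stop tensor. Per the traceback, that all_reduce is how the 64 ranks agree on when to stop an epoch: each rank contributes 0 or 1, and a SUM greater than zero tells every rank that some rank has stopped. A minimal runnable sketch of the pattern (single-process gloo group so it runs anywhere; the surrounding ESPnet logic is paraphrased from the traceback, not copied):

    # Minimal sketch of the iterator_stop all-reduce that the traceback below
    # points at (espnet2/train/trainer.py:516). Uses a single-process gloo
    # group for portability; the real job runs NCCL across 64 ranks.
    import torch
    import torch.distributed as dist
    from torch.distributed import ReduceOp

    dist.init_process_group(
        "gloo", init_method="tcp://127.0.0.1:29501", rank=0, world_size=1
    )
    iterator_stop = torch.tensor(0)  # 1 would mean: this rank is out of data
    dist.all_reduce(iterator_stop, ReduceOp.SUM)
    if int(iterator_stop) > 0:       # any rank stopped -> all ranks stop
        print("stop the epoch on every rank")
    dist.destroy_process_group()

When a rank instead aborts its NCCL communicators, as rank 47 does here, the other ranks discover the NCCLABORTEDCOMM key in the store and abort their own communicators, which is exactly the cascade of ProcessGroupNCCL warnings that follows.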
+/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML + warnings.warn("Can't initialize NVML") +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1534, in all_reduce + work = default_pg.allreduce([tensor], opts) +RuntimeError: CUDA error: unknown error +CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1. +gpub078:4170392:4170392 [3] NCCL INFO comm 0x4f67f390 rank 47 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +[W ProcessGroupNCCL.cpp:948] [Rank 6] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 5] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 13] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 12] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 15] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 14] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 4] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 22] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 21] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 20] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 23] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +gpub037:1522725:1522810 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +gpub037:1522724:1522808 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub032:3289604:3289696 [1] NCCL INFO [Service thread] Connection closed by localRank 1 +gpub032:3289605:3289693 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub032:3289606:3289626 [0] NCCL INFO comm 0x501cec20 rank 15 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +gpub026:2433085:2433171 [1] NCCL INFO [Service thread] Connection closed by localRank 1 +gpub026:2433084:2433172 [0] NCCL INFO [Service thread] Connection closed by localRank 0 +gpub026:2433086:2433173 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub037:1522724:1522745 [0] NCCL INFO comm 0xab8ed350 rank 22 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +gpub032:3289604:3289627 [0] NCCL INFO comm 0x50c34690 rank 13 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +gpub032:3289605:3289624 [0] NCCL INFO comm 0xb6f8bc90 rank 14 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +gpub026:2433085:2433106 [0] NCCL INFO comm 0xb7dab990 rank 5 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +gpub037:1522725:1522747 [0] NCCL INFO comm 0x4f7df910 rank 23 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +gpub026:2433084:2433108 [0] NCCL INFO comm 0x4fe36690 rank 4 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +gpub026:2433086:2433109 [0] NCCL INFO comm 0xc27df910 rank 6 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +gpub037:1522723:1522746 [0] NCCL INFO comm 0xba5d23a0 rank 21 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +gpub037:1522722:1522811 [0] NCCL INFO [Service thread] Connection closed by localRank 0 +gpub037:1522722:1522744 [0] NCCL INFO comm 0x514cae40 rank 20 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +[W ProcessGroupNCCL.cpp:948] [Rank 39] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 38] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +gpub052:1901670:1901757 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +[W ProcessGroupNCCL.cpp:948] [Rank 36] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 37] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +gpub052:1901670:1901690 [0] NCCL INFO comm 0xb6ced700 rank 39 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +gpub052:1901669:1901759 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub052:1901668:1901758 [1] NCCL INFO [Service thread] Connection closed by localRank 1 +gpub052:1901668:1901689 [0] NCCL INFO comm 0x50134230 rank 37 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +gpub052:1901669:1901688 [0] NCCL INFO comm 0x50c05250 rank 38 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +gpub052:1901667:1901760 [0] NCCL INFO [Service thread] Connection closed by localRank 0 +gpub032:3289603:3289625 [0] NCCL INFO comm 0x9f95b40 rank 12 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +gpub052:1901667:1901691 [0] NCCL INFO comm 0xbc2124a0 rank 36 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +gpub080:4113203:4113295 [0] NCCL INFO [Service thread] Connection closed by localRank 0 +gpub080:4113203:4113203 [0] NCCL INFO comm 0xa21d7f0 rank 52 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +gpub015:879783:879861 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +gpub015:879783:879783 [3] NCCL INFO comm 0x5071eb50 rank 3 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +Process SpawnProcess-3: +gpub079:2657933:2658017 [1] NCCL INFO [Service thread] Connection closed by localRank 1 +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 14] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 13] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +gpub079:2657933:2657933 [1] NCCL INFO comm 0x8f776d0 rank 49 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +Process SpawnProcess-1: +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 36] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 6] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 15] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 23] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 12] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 5] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 51] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 50] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +gpub079:2657934:2658014 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub079:2657935:2658016 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +[W ProcessGroupNCCL.cpp:948] [Rank 48] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 28] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 4] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 29] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-2: +gpub079:2657935:2657956 [0] NCCL INFO comm 0x4edd83d0 rank 51 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 21] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 31] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-1: +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 20] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 22] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +gpub050:1879228:1879313 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +gpub050:1879226:1879312 [1] NCCL INFO [Service thread] Connection closed by localRank 1 +[W ProcessGroupNCCL.cpp:948] [Rank 30] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +gpub050:1879227:1879310 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: [Rank 47] Caught collective operation timeout: WorkNCCL(SeqNum=3865712, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800494 milliseconds before timing out. 
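The record directly above is the root cause of the cascade in this part of the log: rank 47's ALLREDUCE sat for the full 1800000 ms (30 min) watchdog window, ProcessGroupNCCL aborted its communicators, and every other rank then found the NCCLABORTEDCOMM key in the store. If stalls of this length can be legitimate (for example, slow iter-factory builds on a shared filesystem), the window can be widened when the process group is created. A minimal sketch, assuming a PyTorch 1.x setup like this log's environment; the init file path and rank wiring are placeholders, not values from this run:

    import datetime
    import os

    import torch.distributed as dist

    # Abort hung collectives instead of blocking forever; this env var drives
    # the same watchdog behaviour that produced the aborts visible in this log,
    # and it also makes the timeout below apply to the NCCL backend.
    os.environ.setdefault("NCCL_ASYNC_ERROR_HANDLING", "1")

    dist.init_process_group(
        backend="nccl",
        init_method="file:///path/to/.dist_init",     # placeholder path
        world_size=64,                                # nranks seen in this log
        rank=int(os.environ.get("SLURM_PROCID", 0)),  # illustrative rank wiring
        timeout=datetime.timedelta(hours=1),          # default is 30 min, as above
    )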
+gpub079:2657934:2657955 [0] NCCL INFO comm 0x505ec9b0 rank 50 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +gpub050:1879226:1879246 [0] NCCL INFO comm 0x50792660 rank 29 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +gpub050:1879228:1879247 [0] NCCL INFO comm 0x5177dae0 rank 31 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +gpub050:1879227:1879245 [0] NCCL INFO comm 0x50baa200 rank 30 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 38] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 33] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 32] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 34] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 35] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 39] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 37] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 47. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: [Rank 52] Caught collective operation timeout: WorkNCCL(SeqNum=3865712, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800348 milliseconds before timing out. +gpub051:2913626:2913716 [3] NCCL INFO [Service thread] Connection closed by localRank 3 +gpub050:1879225:1879311 [0] NCCL INFO [Service thread] Connection closed by localRank 0 +gpub051:2913625:2913713 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub051:2913626:2913649 [0] NCCL INFO comm 0x9e42a10 rank 35 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE +gpub050:1879225:1879248 [0] NCCL INFO comm 0xa81f9440 rank 28 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +gpub051:2913625:2913648 [0] NCCL INFO comm 0xb9b5ccd0 rank 34 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +gpub079:2657932:2657957 [0] NCCL INFO comm 0x8c890be0 rank 48 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +gpub015:879781:879860 [1] NCCL INFO [Service thread] Connection closed by localRank 1 +gpub015:879781:879781 [1] NCCL INFO comm 0x8d09c1b0 rank 1 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +Process SpawnProcess-4: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: [Rank 3] Caught collective operation timeout: WorkNCCL(SeqNum=3865712, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800141 milliseconds before timing out. 
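Every healthy rank in these tracebacks is blocked in the same place, espnet2/train/trainer.py line 516, the collective through which the ranks agree whether any data iterator is exhausted. Because an all_reduce needs all 64 ranks to arrive, the ranks that timed out (47, 52, and 3 so far) take down the entire job. A rough, illustrative reconstruction of that synchronization pattern, not ESPnet's actual code; train_step is a hypothetical stand-in for the per-batch work:

    import torch
    import torch.distributed as dist

    def train_one_epoch(iterator, train_step, device="cuda"):
        it = iter(iterator)
        while True:
            try:
                batch = next(it)
                local_stop = 0  # this rank still has data
            except StopIteration:
                batch, local_stop = None, 1  # this rank is exhausted
            flag = torch.tensor(local_stop, dtype=torch.int64, device=device)
            # Every rank must reach this collective on every step. One rank
            # hung before it (e.g. in data loading) stalls all 64 others until
            # the 30-minute NCCL watchdog timeout seen above fires.
            dist.all_reduce(flag, dist.ReduceOp.SUM)
            if int(flag) > 0:
                break  # some rank ran out of data: all ranks stop in lockstep
            train_step(batch)  # hypothetical per-batch work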
+Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: [Rank 49] Caught collective operation timeout: WorkNCCL(SeqNum=3865712, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800195 milliseconds before timing out. +gpub051:2913623:2913647 [0] NCCL INFO comm 0x8dc3e980 rank 32 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE +Process SpawnProcess-4: +Process SpawnProcess-4: +Traceback (most recent call last): +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 31] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 35] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 29] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +gpub051:2913624:2913650 [0] NCCL INFO comm 0xbb329750 rank 33 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 28] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) +gpub081:2742228:2742325 [1] NCCL INFO [Service thread] Connection closed by localRank 1 + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 34] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
Aborting appropriate communicators +gpub081:2742228:2742228 [1] NCCL INFO comm 0xb78a1250 rank 57 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE +Process SpawnProcess-1: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 48] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: [Rank 1] Caught collective operation timeout: WorkNCCL(SeqNum=3865712, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800012 milliseconds before timing out. 
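The first traceback in this cascade (rank 47, "CUDA error: unknown error") carried PyTorch's standard hint about CUDA_LAUNCH_BLOCKING=1: kernel launches are asynchronous, so the reported stack frame is often not the op that actually faulted. A sketch of how a debugging re-run would apply that hint; the variable has to be in the environment before CUDA is first touched (e.g. exported on the srun line), shown here at the top of an entry point:

    import os

    # Must be set before the first CUDA call anywhere in the process.
    os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")

    import torch  # imported only after the env var is in place

    # With synchronous launches, a kernel failure raises at the line that
    # issued it (much slower; suitable for debugging runs only).
    x = torch.randn(8, device="cuda" if torch.cuda.is_available() else "cpu")
    print(x.sum().item())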
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator was aborted on rank 30. Original reason for failure was: [Rank 30] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator was aborted on rank 51. Original reason for failure was: [Rank 51] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 32] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator was aborted on rank 50. Original reason for failure was: [Rank 50] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub081:2742227:2742227 [0] NCCL INFO comm 0x518b4950 rank 56 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+[W ProcessGroupNCCL.cpp:948] [Rank 55] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 52. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 54] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 52. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 53] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 52. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 33] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 49. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub080:4113206:4113298 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+gpub080:4113204:4113297 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub080:4113205:4113296 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+gpub080:4113206:4113224 [0] NCCL INFO comm 0x8c72c2a0 rank 55 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub080:4113205:4113226 [0] NCCL INFO comm 0x50af0e00 rank 54 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub080:4113204:4113225 [0] NCCL INFO comm 0xb71b4bf0 rank 53 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: [Rank 57] Caught collective operation timeout: WorkNCCL(SeqNum=3865712, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800134 milliseconds before timing out.
+gpub078:4170389:4170389 [0] NCCL INFO comm 0x4f656710 rank 44 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 55] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 52. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: [Rank 56] Caught collective operation timeout: WorkNCCL(SeqNum=3865712, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800687 milliseconds before timing out.
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 54] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 52. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: [Rank 44] Caught collective operation timeout: WorkNCCL(SeqNum=3865712, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800885 milliseconds before timing out.
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 53] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 52. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 61] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 44. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 60] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 44. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 63] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 44. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 62] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 44. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub015:879780:879780 [0] NCCL INFO comm 0x51871d20 rank 0 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+[W ProcessGroupNCCL.cpp:948] [Rank 24] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 0. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 27] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 0. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 26] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 0. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 25] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 0. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub082:1518446:1518535 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub082:1518446:1518467 [0] NCCL INFO comm 0xb6caaae0 rank 61 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub082:1518447:1518469 [0] NCCL INFO comm 0xb6376a90 rank 62 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub082:1518448:1518468 [0] NCCL INFO comm 0x8c5b6f90 rank 63 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub049:4064877:4064950 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+gpub049:4064875:4064949 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub049:4064877:4064897 [0] NCCL INFO comm 0x4f5c00a0 rank 27 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub049:4064875:4064894 [0] NCCL INFO comm 0xa8769be0 rank 25 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub049:4064876:4064896 [0] NCCL INFO comm 0xb89777d0 rank 26 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub049:4064874:4064895 [0] NCCL INFO comm 0x500f4c60 rank 24 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+[W ProcessGroupNCCL.cpp:948] [Rank 59] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 0. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 58] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 57. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub081:2742229:2742324 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+gpub081:2742230:2742253 [0] NCCL INFO comm 0xba992be0 rank 59 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub081:2742229:2742252 [0] NCCL INFO comm 0x50f92c00 rank 58 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 61] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 44. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub082:1518445:1518470 [0] NCCL INFO comm 0x519aa9d0 rank 60 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: [Rank 0] Caught collective operation timeout: WorkNCCL(SeqNum=3865712, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1801161 milliseconds before timing out.
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 25] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 0. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 24] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 0. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 27] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 0. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 26] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 0. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator was aborted on rank 62. Original reason for failure was: [Rank 62] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 44. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator was aborted on rank 63. Original reason for failure was: [Rank 63] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 44. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 60] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 44. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 59] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 0. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 58] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 57. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 7] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 57. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 9] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 57. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 10] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 57. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub031:1921205:1921294 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+[W ProcessGroupNCCL.cpp:948] [Rank 8] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 57. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub026:2433087:2433174 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+gpub026:2433087:2433107 [0] NCCL INFO comm 0x50347080 rank 7 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub031:1921205:1921228 [0] NCCL INFO comm 0x92a3a80 rank 9 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub031:1921206:1921295 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+gpub031:1921206:1921230 [0] NCCL INFO comm 0xc2e65190 rank 10 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub053:1664487:1664568 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub053:1664487:1664487 [1] NCCL INFO comm 0x506110d0 rank 41 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub031:1921204:1921227 [0] NCCL INFO comm 0xb63f1750 rank 8 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 9] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 57. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 7] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 57. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 10] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 57. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 8] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 57. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 17] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 18] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub036:1870498:1870586 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+gpub036:1870498:1870518 [0] NCCL INFO comm 0x50c66a10 rank 18 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+[W ProcessGroupNCCL.cpp:948] [Rank 16] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+[W ProcessGroupNCCL.cpp:948] [Rank 19] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub036:1870499:1870587 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+gpub036:1870499:1870519 [0] NCCL INFO comm 0xa269c50 rank 19 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub036:1870497:1870521 [0] NCCL INFO comm 0x4fcaadc0 rank 17 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: [Rank 41] Caught collective operation timeout: WorkNCCL(SeqNum=3865712, OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800142 milliseconds before timing out.
+[W ProcessGroupNCCL.cpp:948] [Rank 2] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
Aborting appropriate communicators +gpub015:879782:879859 [2] NCCL INFO [Service thread] Connection closed by localRank 2 +gpub015:879782:879802 [0] NCCL INFO comm 0x502ad7c0 rank 2 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE +Process SpawnProcess-2: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch + torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce + work.wait() +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 17] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 42] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 40] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 43] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 46] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +[W ProcessGroupNCCL.cpp:948] [Rank 45] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
+gpub053:1664489:1664567 [3] NCCL INFO [Service thread] Connection closed by localRank 3
+gpub053:1664488:1664566 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 18] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub053:1664489:1664509 [0] NCCL INFO comm 0xa9e28fe0 rank 43 nranks 64 cudaDev 3 busId c7000 - Abort COMPLETE
+gpub036:1870496:1870520 [0] NCCL INFO comm 0xad17bd0 rank 16 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+gpub053:1664488:1664511 [0] NCCL INFO comm 0xe3027a0 rank 42 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+gpub078:4170391:4170477 [2] NCCL INFO [Service thread] Connection closed by localRank 2
+gpub078:4170390:4170479 [1] NCCL INFO [Service thread] Connection closed by localRank 1
+gpub078:4170390:4170413 [0] NCCL INFO comm 0x1d97f440 rank 45 nranks 64 cudaDev 1 busId 46000 - Abort COMPLETE
+gpub078:4170391:4170416 [0] NCCL INFO comm 0x5187a990 rank 46 nranks 64 cudaDev 2 busId 85000 - Abort COMPLETE
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator was aborted on rank 19. Original reason for failure was: [Rank 19] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 2] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 16] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+gpub053:1664486:1664508 [0] NCCL INFO comm 0x4f7ecd60 rank 40 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+Process SpawnProcess-3:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
"/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced + self._distributed_broadcast_coalesced( + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced + dist._broadcast_coalesced( +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 42] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch + retval = model(**batch) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl + return forward_call(*input, **kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward + self._sync_buffers() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers + self._sync_module_buffers(authoritative_rank) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers + self._default_broadcast_coalesced(authoritative_rank=authoritative_rank) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced + self._distributed_broadcast_coalesced( + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced + dist._broadcast_coalesced( +RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 46] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. 
+Process SpawnProcess-2:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 45] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-4:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 567, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1034, in forward
+    self._sync_buffers()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
+    self._sync_module_buffers(authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
+    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
+    self._distributed_broadcast_coalesced(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
+    dist._broadcast_coalesced(
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 43] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 516, in train_one_epoch
+    torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1541, in all_reduce
+    work.wait()
+RuntimeError: NCCL communicator encountered error set by ProcessGroupNCCL: [Rank 40] Found key in store: NCCLABORTEDCOMM:20b41dac1c1773000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, from rank: 41. This means that rank has aborted its NCCL communicators previously and is not in a healthy state.. Aborting appropriate communicators
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in <module>
+    main()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main
+    S2TTask.main(cmd=cmd)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main
+    while not ProcessContext(processes, error_queues).join():
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join
+    raise ProcessExitedException(
+torch.multiprocessing.spawn.ProcessExitedException: process 1 terminated with exit code 1
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in <module>
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+    return _run_code(code, main_globals, None,
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
"/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + main() + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + S2TTask.main(cmd=cmd) + S2TTask.main(cmd=cmd) + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + while not ProcessContext(processes, error_queues).join(): + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + while not ProcessContext(processes, error_queues).join(): + while not ProcessContext(processes, error_queues).join(): + File 
"/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( + S2TTask.main(cmd=cmd) + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 1 terminated with exit code 1 + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 3 terminated with exit code 1 + raise ProcessExitedException( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main +torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with exit code 1 +torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + raise ProcessExitedException( + return _run_code(code, main_globals, None, +torch.multiprocessing.spawn.ProcessExitedException: process 3 terminated with exit code 1 + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 3 terminated with exit code 1 + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + return _run_code(code, main_globals, None, + File 
"/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + raise ProcessExitedException( + S2TTask.main(cmd=cmd) +torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main +torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + main() + raise ProcessExitedException( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main +torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with exit code 1 + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File 
"/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with exit code 1 +srun: error: gpub052: task 9: Exited with exit code 1 +srun: error: gpub026: task 1: Exited with exit code 1 +srun: error: gpub080: task 13: Exited with exit code 1 +srun: error: gpub079: task 12: Exited with exit code 1 +srun: error: gpub015: task 0: Exited with exit code 1 +srun: error: gpub081: task 14: Exited with exit code 1 +srun: error: gpub082: task 15: Exited with exit code 1 +srun: error: gpub032: task 3: Exited with exit code 1 +srun: error: gpub050: task 7: Exited with exit code 1 +srun: error: gpub037: task 5: Exited with exit code 1 +srun: error: gpub049: task 6: Exited with exit code 1 +srun: error: gpub078: task 11: Exited with exit code 1 +srun: error: gpub051: task 8: Exited with exit code 1 +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with exit code 1 + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 1 terminated with 
+srun: error: gpub036: task 4: Exited with exit code 1
+srun: error: gpub053: task 10: Exited with exit code 1
+srun: Job step aborted: Waiting up to 32 seconds for job step to finish.