diff --git "a/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.12.log" "b/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.12.log" new file mode 100644--- /dev/null +++ "b/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.12.log" @@ -0,0 +1,4556 @@ +# Running on gpub001.delta.ncsa.illinois.edu +# Started at Sun Jul 2 01:35:09 CDT 2023 +# SLURMD_NODENAME=gpub001 +# SLURM_CLUSTER_NAME=delta +# SLURM_CONF=/var/spool/slurmd/conf-cache/slurm.conf +# SLURM_CPUS_ON_NODE=64 +# SLURM_CPUS_PER_TASK=64 +# SLURM_EXPORT_ENV=PATH +# SLURM_GET_USER_ENV=1 +# SLURM_GPUS_ON_NODE=4 +# SLURM_GTIDS=0 +# SLURM_JOBID=2115302 +# SLURM_JOB_ACCOUNT=bbjs-delta-gpu +# SLURM_JOB_CPUS_PER_NODE='64(x32)' +# SLURM_JOB_GID=202 +# SLURM_JOB_GPUS=0,1,2,3 +# SLURM_JOB_ID=2115302 +# SLURM_JOB_NAME=exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/train.log +# SLURM_JOB_NODELIST='gpub[001,009,011-016,031-032,035,037-041,058-059,061,064-068,075,080,083,085,088-091]' +# SLURM_JOB_NUM_NODES=32 +# SLURM_JOB_PARTITION=gpuA40x4 +# SLURM_JOB_QOS=bbjs-delta-gpu +# SLURM_JOB_UID=68077 +# SLURM_JOB_USER=peng6 +# SLURM_LOCALID=0 +# SLURM_MEM_PER_NODE=240000 +# SLURM_NNODES=32 +# SLURM_NODEID=0 +# SLURM_NODELIST='gpub[001,009,011-016,031-032,035,037-041,058-059,061,064-068,075,080,083,085,088-091]' +# SLURM_NODE_ALIASES='(null)' +# SLURM_OPEN_MODE=a +# SLURM_PRIO_PROCESS=0 +# SLURM_PROCID=0 +# SLURM_SUBMIT_DIR=/scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1 +# SLURM_SUBMIT_HOST=dt-login02.delta.internal.ncsa.edu +# SLURM_TASKS_PER_NODE='1(x32)' +# SLURM_TASK_PID=279842 +# SLURM_TOPOLOGY_ADDR=ss00.ss09.gpub001 +# SLURM_TOPOLOGY_ADDR_PATTERN=switch.switch.node +# SLURM_WORKING_CLUSTER=delta:dt-sched:6817:9728:109 +# srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fol/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000 --config conf/train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distrd_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +ibuted true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +d_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits10/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits10/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v3/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/.dist_init_75cd0950-5140-438e-8876-03978c5bec75 +[gpub001:0/128] 2023-07-02 01:39:06,588 (distributed_c10d:319) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[gpub001:0/128] 2023-07-02 01:39:08,180 (distributed_c10d:353) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 128 nodes. +[gpub001:0/128] 2023-07-02 01:39:08,214 (s2t:483) INFO: Vocabulary size: 50002 +[gpub001:0/128] 2023-07-02 01:39:28,889 (abs_task:1201) INFO: pytorch.version=1.13.1, cuda.available=True, cudnn.version=8500, cudnn.benchmark=False, cudnn.deterministic=True +[gpub001:0/128] 2023-07-02 01:39:28,898 (abs_task:1202) INFO: Model structure: +ESPnetS2TModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=400, hop_length=160, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (specaug): SpecAug( + (freq_mask): MaskAlongAxis(mask_width_range=[0, 27], num_mask=2, axis=freq) + (time_mask): MaskAlongAxisVariableMaxWidth(mask_width_ratio_range=[0.0, 0.05], num_mask=10, axis=time) + ) + (normalize): GlobalMVN(stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): TransformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=19456, out_features=1024, bias=True) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (12): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (13): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (14): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (15): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (16): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (17): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (18): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (19): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (20): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (21): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (22): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (23): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(50002, 1024) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=1024, out_features=50002, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (12): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (13): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (14): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (15): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (16): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (17): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (18): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (19): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (20): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (21): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (22): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (23): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) + (ctc): CTC( + (ctc_lo): Linear(in_features=1024, out_features=50002, bias=True) + (ctc_loss): CTCLoss() + ) +) + +Model summary: + Class Name: ESPnetS2TModel + Total Number of model parameters: 888.51 M + Number of trainable parameters: 888.51 M (100.0%) + Size: 3.55 GB + Type: torch.float32 +[gpub001:0/128] 2023-07-02 01:39:28,898 (abs_task:1205) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.9, 0.98] + capturable: False + eps: 1e-06 + foreach: None + initial_lr: 0.00025 + lr: 2.5e-08 + maximize: False + weight_decay: 0.0 +) +[gpub001:0/128] 2023-07-02 01:39:28,898 (abs_task:1206) INFO: Scheduler: WarmupLR(warmup_steps=10000) +[gpub001:0/128] 2023-07-02 01:39:28,899 (abs_task:1215) INFO: Saving the configuration in exp/s2t_train_s2t_transformer_conv2d_size1024_e24_d24_lr2.5e-4_warmup10k_finetune_raw_bpe50000/config.yaml +[gpub001:0/128] 2023-07-02 01:39:29,583 (abs_task:1272) INFO: Loading pretrained params from /scratch/bbjs/peng6/espnet-whisper-public/egs2/mixed_v2/s2t1/exp/s2t_train_s2t_transformer_conv2d_size1024_e18_d18_lr5e-4_warmup20k_raw_bpe50000/valid.acc.ave.pth +[gpub001:0/128] 2023-07-02 01:39:42,237 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 01:39:42,389 (abs_task:1570) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev/wav.scp", "type": "kaldi_ark"} + text_prev: {"path": "dump/raw/dev/text.prev", "type": "text"} + text_ctc: {"path": "dump/raw/dev/text.ctc", "type": "text"} + text: {"path": "dump/raw/dev/text", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 01:39:42,389 (abs_task:1571) INFO: [valid] Batch sampler: UnsortedBatchSampler(N-batch=506, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/valid/speech_shape, +[gpub001:0/128] 2023-07-02 01:39:42,389 (abs_task:1572) INFO: [valid] mini-batch sizes summary: N-batch=506, mean=256.1, min=256, max=257 +[gpub001:0/128] 2023-07-02 01:39:42,865 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 01:39:43,172 (abs_task:1570) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev/wav.scp", "type": "kaldi_ark"} + text_prev: {"path": "dump/raw/dev/text.prev", "type": "text"} + text_ctc: {"path": "dump/raw/dev/text.ctc", "type": "text"} + text: {"path": "dump/raw/dev/text", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 01:39:43,172 (abs_task:1571) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=129591, batch_size=1, key_file=exp/s2t_stats_raw_bpe50000/valid/speech_shape, +[gpub001:0/128] 2023-07-02 01:39:43,173 (abs_task:1572) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +gpub001:279948:279948 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.101<0> +gpub001:279948:279948 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub001:279948:279948 [0] NCCL INFO cudaDriverVersion 12010 +NCCL version 2.14.3+cuda11.7 +[gpub001:0/128] 2023-07-02 01:39:49,879 (trainer:284) INFO: 1/100epoch started +[gpub001:0/128] 2023-07-02 01:39:49,937 (multiple_iter_factory:32) INFO: Building 0th iter-factory... +[gpub001:0/128] 2023-07-02 01:40:11,535 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 01:40:15,587 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.2", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.2", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.2", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.2", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 01:40:15,588 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.2, +[gpub001:0/128] 2023-07-02 01:40:15,591 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +gpub012:1262644:1262644 [0] NCCL INFO cudaDriverVersion 12010 +gpub012:1262644:1262644 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.112<0> +gpub012:1262644:1262644 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub012:1262644:1262706 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.112<0> +gpub012:1262644:1262706 [0] NCCL INFO Using network IB +gpub012:1262644:1262706 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub012:1262644:1262706 [0] NCCL INFO Trees [0] 13/-1/-1->12->8 [1] 13/4/-1->12->28 +gpub012:1262644:1262706 [0] NCCL INFO Channel 00/0 : 11[c7000] -> 12[7000] [receive] via NET/IB/0 +gpub012:1262644:1262706 [0] NCCL INFO Channel 01/0 : 11[c7000] -> 12[7000] [receive] via NET/IB/0 +gpub012:1262644:1262706 [0] NCCL INFO Channel 00/0 : 12[7000] -> 13[46000] via P2P/IPC +gpub012:1262644:1262706 [0] NCCL INFO Channel 01/0 : 12[7000] -> 13[46000] via P2P/IPC +gpub012:1262644:1262706 [0] NCCL INFO Connected all rings +gpub012:1262644:1262706 [0] NCCL INFO Channel 00/0 : 8[7000] -> 12[7000] [receive] via NET/IB/0 +gpub012:1262644:1262706 [0] NCCL INFO Channel 01/0 : 4[7000] -> 12[7000] [receive] via NET/IB/0 +gpub012:1262644:1262706 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [send] via NET/IB/0 +gpub012:1262644:1262706 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [receive] via NET/IB/0 +gpub012:1262644:1262706 [0] NCCL INFO Channel 01/0 : 12[7000] -> 4[7000] [send] via NET/IB/0 +gpub012:1262644:1262706 [0] NCCL INFO Channel 00/0 : 12[7000] -> 8[7000] [send] via NET/IB/0 +gpub012:1262644:1262706 [0] NCCL INFO Connected all trees +gpub012:1262644:1262706 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub012:1262644:1262706 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub012:1262644:1262706 [0] NCCL INFO comm 0xb67ef980 rank 12 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub067:1289107:1289107 [0] NCCL INFO cudaDriverVersion 12010 +gpub067:1289107:1289107 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.167<0> +gpub067:1289107:1289107 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub067:1289107:1289169 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.167<0> +gpub067:1289107:1289169 [0] NCCL INFO Using network IB +gpub067:1289107:1289169 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub067:1289107:1289169 [0] NCCL INFO Trees [0] 89/92/-1->88->80 [1] 89/-1/-1->88->85 +gpub067:1289107:1289169 [0] NCCL INFO Channel 00/0 : 87[c7000] -> 88[7000] [receive] via NET/IB/0 +gpub067:1289107:1289169 [0] NCCL INFO Channel 01/0 : 87[c7000] -> 88[7000] [receive] via NET/IB/0 +gpub067:1289107:1289169 [0] NCCL INFO Channel 00/0 : 88[7000] -> 89[46000] via P2P/IPC +gpub067:1289107:1289169 [0] NCCL INFO Channel 01/0 : 88[7000] -> 89[46000] via P2P/IPC +gpub067:1289107:1289169 [0] NCCL INFO Connected all rings +gpub014:1242932:1242932 [2] NCCL INFO cudaDriverVersion 12010 +gpub014:1242932:1242932 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.114<0> +gpub014:1242932:1242932 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub014:1242932:1242996 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.114<0> +gpub014:1242932:1242996 [2] NCCL INFO Using network IB +gpub014:1242932:1242996 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub014:1242932:1242996 [2] NCCL INFO Trees [0] 23/-1/-1->22->21 [1] 23/-1/-1->22->21 +gpub014:1242932:1242996 [2] NCCL INFO Channel 00/0 : 22[85000] -> 23[c7000] via P2P/IPC +gpub014:1242932:1242996 [2] NCCL INFO Channel 01/0 : 22[85000] -> 23[c7000] via P2P/IPC +gpub014:1242932:1242996 [2] NCCL INFO Connected all rings +gpub014:1242932:1242996 [2] NCCL INFO Channel 00/0 : 22[85000] -> 21[46000] via P2P/IPC +gpub014:1242932:1242996 [2] NCCL INFO Channel 01/0 : 22[85000] -> 21[46000] via P2P/IPC +gpub064:1376670:1376670 [1] NCCL INFO cudaDriverVersion 12010 +gpub064:1376670:1376670 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.164<0> +gpub064:1376670:1376670 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub064:1376670:1376736 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.164<0> +gpub064:1376670:1376736 [1] NCCL INFO Using network IB +gpub064:1376670:1376736 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub064:1376670:1376736 [1] NCCL INFO Trees [0] 78/-1/-1->77->76 [1] 78/84/-1->77->76 +gpub064:1376670:1376736 [1] NCCL INFO Channel 00/0 : 77[46000] -> 78[85000] via P2P/IPC +gpub064:1376670:1376736 [1] NCCL INFO Channel 01/0 : 77[46000] -> 78[85000] via P2P/IPC +gpub064:1376670:1376736 [1] NCCL INFO Connected all rings +gpub064:1376670:1376736 [1] NCCL INFO Channel 01/0 : 77[46000] -> 84[7000] [send] via NET/IB/0 +gpub064:1376670:1376736 [1] NCCL INFO Channel 01/0 : 84[7000] -> 77[46000] [receive] via NET/IB/0 +gpub037:1358540:1358540 [2] NCCL INFO cudaDriverVersion 12010 +gpub037:1358540:1358540 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.137<0> +gpub037:1358540:1358540 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub037:1358540:1358598 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.137<0> +gpub037:1358540:1358598 [2] NCCL INFO Using network IB +gpub037:1358540:1358598 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub037:1358540:1358598 [2] NCCL INFO Trees [0] 47/-1/-1->46->45 [1] 47/-1/-1->46->45 +gpub037:1358540:1358598 [2] NCCL INFO Channel 00/0 : 46[85000] -> 47[c7000] via P2P/IPC +gpub037:1358540:1358598 [2] NCCL INFO Channel 01/0 : 46[85000] -> 47[c7000] via P2P/IPC +gpub037:1358540:1358598 [2] NCCL INFO Connected all rings +gpub037:1358540:1358598 [2] NCCL INFO Channel 00/0 : 46[85000] -> 45[46000] via P2P/IPC +gpub037:1358540:1358598 [2] NCCL INFO Channel 01/0 : 46[85000] -> 45[46000] via P2P/IPC +gpub012:1262647:1262647 [3] NCCL INFO cudaDriverVersion 12010 +gpub012:1262647:1262647 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.112<0> +gpub012:1262647:1262647 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub012:1262647:1262709 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.112<0> +gpub012:1262647:1262709 [3] NCCL INFO Using network IB +gpub012:1262647:1262709 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub012:1262647:1262709 [3] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] -1/-1/-1->15->14 +gpub012:1262647:1262709 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 16[7000] [send] via NET/IB/0 +gpub012:1262647:1262709 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 16[7000] [send] via NET/IB/0 +gpub012:1262647:1262709 [3] NCCL INFO Connected all rings +gpub012:1262647:1262709 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 14[85000] via P2P/IPC +gpub012:1262647:1262709 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 14[85000] via P2P/IPC +gpub032:2709149:2709149 [1] NCCL INFO cudaDriverVersion 12010 +gpub032:2709149:2709149 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.132<0> +gpub032:2709149:2709149 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub032:2709149:2709215 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.132<0> +gpub032:2709149:2709215 [1] NCCL INFO Using network IB +gpub032:2709149:2709215 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub032:2709149:2709215 [1] NCCL INFO Trees [0] 38/-1/-1->37->36 [1] 38/40/-1->37->36 +gpub032:2709149:2709215 [1] NCCL INFO Channel 00/0 : 37[46000] -> 38[85000] via P2P/IPC +gpub032:2709149:2709215 [1] NCCL INFO Channel 01/0 : 37[46000] -> 38[85000] via P2P/IPC +gpub032:2709149:2709215 [1] NCCL INFO Connected all rings +gpub032:2709149:2709215 [1] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [send] via NET/IB/0 +gpub032:2709149:2709215 [1] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [receive] via NET/IB/0 +gpub089:1443526:1443526 [0] NCCL INFO cudaDriverVersion 12010 +gpub089:1443526:1443526 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.189<0> +gpub089:1443526:1443526 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub089:1443526:1443886 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.189<0> +gpub089:1443526:1443886 [0] NCCL INFO Using network IB +gpub089:1443526:1443886 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub089:1443526:1443886 [0] NCCL INFO Trees [0] 117/-1/-1->116->121 [1] 117/112/-1->116->109 +gpub089:1443526:1443886 [0] NCCL INFO Channel 00/0 : 115[c7000] -> 116[7000] [receive] via NET/IB/0 +gpub089:1443526:1443886 [0] NCCL INFO Channel 01/0 : 115[c7000] -> 116[7000] [receive] via NET/IB/0 +gpub089:1443526:1443886 [0] NCCL INFO Channel 00/0 : 116[7000] -> 117[46000] via P2P/IPC +gpub089:1443526:1443886 [0] NCCL INFO Channel 01/0 : 116[7000] -> 117[46000] via P2P/IPC +gpub009:1313739:1313739 [0] NCCL INFO cudaDriverVersion 12010 +gpub009:1313739:1313739 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.109<0> +gpub009:1313739:1313739 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub009:1313739:1313796 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.109<0> +gpub009:1313739:1313796 [0] NCCL INFO Using network IB +gpub009:1313739:1313796 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub009:1313739:1313796 [0] NCCL INFO Trees [0] 5/-1/-1->4->9 [1] 5/0/-1->4->12 +gpub009:1313739:1313796 [0] NCCL INFO Channel 00/0 : 3[c7000] -> 4[7000] [receive] via NET/IB/0 +gpub009:1313739:1313796 [0] NCCL INFO Channel 01/0 : 3[c7000] -> 4[7000] [receive] via NET/IB/0 +gpub009:1313739:1313796 [0] NCCL INFO Channel 00/0 : 4[7000] -> 5[46000] via P2P/IPC +gpub009:1313739:1313796 [0] NCCL INFO Channel 01/0 : 4[7000] -> 5[46000] via P2P/IPC +gpub009:1313739:1313796 [0] NCCL INFO Connected all rings +gpub039:1773546:1773546 [1] NCCL INFO cudaDriverVersion 12010 +gpub039:1773546:1773546 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.139<0> +gpub039:1773546:1773546 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub039:1773546:1773609 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.139<0> +gpub039:1773546:1773609 [1] NCCL INFO Using network IB +gpub039:1773546:1773609 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub039:1773546:1773609 [1] NCCL INFO Trees [0] 54/-1/-1->53->52 [1] 54/56/-1->53->52 +gpub039:1773546:1773609 [1] NCCL INFO Channel 00/0 : 53[46000] -> 54[85000] via P2P/IPC +gpub039:1773546:1773609 [1] NCCL INFO Channel 01/0 : 53[46000] -> 54[85000] via P2P/IPC +gpub039:1773546:1773609 [1] NCCL INFO Connected all rings +gpub039:1773546:1773609 [1] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [send] via NET/IB/0 +gpub039:1773546:1773609 [1] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [receive] via NET/IB/0 +gpub059:1722911:1722911 [2] NCCL INFO cudaDriverVersion 12010 +gpub059:1722911:1722911 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.159<0> +gpub059:1722911:1722911 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub059:1722911:1722975 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.159<0> +gpub059:1722911:1722975 [2] NCCL INFO Using network IB +gpub059:1722911:1722975 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub059:1722911:1722975 [2] NCCL INFO Trees [0] 71/-1/-1->70->69 [1] 71/-1/-1->70->69 +gpub059:1722911:1722975 [2] NCCL INFO Channel 00/0 : 70[85000] -> 71[c7000] via P2P/IPC +gpub059:1722911:1722975 [2] NCCL INFO Channel 01/0 : 70[85000] -> 71[c7000] via P2P/IPC +gpub059:1722911:1722975 [2] NCCL INFO Connected all rings +gpub059:1722911:1722975 [2] NCCL INFO Channel 00/0 : 70[85000] -> 69[46000] via P2P/IPC +gpub059:1722911:1722975 [2] NCCL INFO Channel 01/0 : 70[85000] -> 69[46000] via P2P/IPC +gpub067:1289107:1289169 [0] NCCL INFO Channel 01/0 : 85[46000] -> 88[7000] [receive] via NET/IB/0 +gpub067:1289107:1289169 [0] NCCL INFO Channel 00/0 : 88[7000] -> 92[7000] [send] via NET/IB/0 +gpub067:1289107:1289169 [0] NCCL INFO Channel 00/0 : 80[7000] -> 88[7000] [receive] via NET/IB/0 +gpub067:1289107:1289169 [0] NCCL INFO Channel 00/0 : 88[7000] -> 80[7000] [send] via NET/IB/0 +gpub067:1289107:1289169 [0] NCCL INFO Channel 00/0 : 92[7000] -> 88[7000] [receive] via NET/IB/0 +gpub067:1289107:1289169 [0] NCCL INFO Channel 01/0 : 88[7000] -> 85[46000] [send] via NET/IB/0 +gpub067:1289107:1289169 [0] NCCL INFO Connected all trees +gpub067:1289107:1289169 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub067:1289107:1289169 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub067:1289107:1289169 [0] NCCL INFO comm 0x4f28f210 rank 88 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub014:1242932:1242996 [2] NCCL INFO Connected all trees +gpub014:1242932:1242996 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub014:1242932:1242996 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub014:1242932:1242996 [2] NCCL INFO comm 0x51262ad0 rank 22 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub064:1376670:1376736 [1] NCCL INFO Channel 00/0 : 77[46000] -> 76[7000] via P2P/IPC +gpub064:1376670:1376736 [1] NCCL INFO Channel 01/0 : 77[46000] -> 76[7000] via P2P/IPC +gpub064:1376670:1376736 [1] NCCL INFO Connected all trees +gpub064:1376670:1376736 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub064:1376670:1376736 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub064:1376670:1376736 [1] NCCL INFO comm 0x504a2c90 rank 77 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub037:1358540:1358598 [2] NCCL INFO Connected all trees +gpub037:1358540:1358598 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub037:1358540:1358598 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub037:1358540:1358598 [2] NCCL INFO comm 0x8ec6490 rank 46 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub037:1358541:1358541 [3] NCCL INFO cudaDriverVersion 12010 +gpub037:1358541:1358541 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.137<0> +gpub037:1358541:1358541 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub037:1358541:1358597 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.137<0> +gpub037:1358541:1358597 [3] NCCL INFO Using network IB +gpub037:1358541:1358597 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub037:1358541:1358597 [3] NCCL INFO Trees [0] -1/-1/-1->47->46 [1] -1/-1/-1->47->46 +gpub037:1358541:1358597 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [send] via NET/IB/0 +gpub012:1262647:1262709 [3] NCCL INFO Connected all trees +gpub012:1262647:1262709 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub012:1262647:1262709 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub012:1262647:1262709 [3] NCCL INFO comm 0xa9d6d20 rank 15 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub032:2709149:2709215 [1] NCCL INFO Channel 00/0 : 37[46000] -> 36[7000] via P2P/IPC +gpub032:2709149:2709215 [1] NCCL INFO Channel 01/0 : 37[46000] -> 36[7000] via P2P/IPC +gpub032:2709149:2709215 [1] NCCL INFO Connected all trees +gpub032:2709149:2709215 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub032:2709149:2709215 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub032:2709149:2709215 [1] NCCL INFO comm 0x4f9a7620 rank 37 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub089:1443526:1443886 [0] NCCL INFO Connected all rings +gpub089:1443526:1443886 [0] NCCL INFO Channel 01/0 : 112[7000] -> 116[7000] [receive] via NET/IB/0 +gpub089:1443526:1443886 [0] NCCL INFO Channel 00/0 : 116[7000] -> 121[46000] [send] via NET/IB/0 +gpub089:1443526:1443886 [0] NCCL INFO Channel 01/0 : 109[46000] -> 116[7000] [receive] via NET/IB/0 +gpub089:1443526:1443886 [0] NCCL INFO Channel 01/0 : 116[7000] -> 109[46000] [send] via NET/IB/0 +gpub089:1443526:1443886 [0] NCCL INFO Channel 00/0 : 121[46000] -> 116[7000] [receive] via NET/IB/0 +gpub089:1443526:1443886 [0] NCCL INFO Channel 01/0 : 116[7000] -> 112[7000] [send] via NET/IB/0 +gpub089:1443526:1443886 [0] NCCL INFO Connected all trees +gpub089:1443526:1443886 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub089:1443526:1443886 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub089:1443526:1443886 [0] NCCL INFO comm 0x9a37990 rank 116 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub009:1313739:1313796 [0] NCCL INFO Channel 01/0 : 0[7000] -> 4[7000] [receive] via NET/IB/0 +gpub009:1313739:1313796 [0] NCCL INFO Channel 00/0 : 4[7000] -> 9[46000] [send] via NET/IB/0 +gpub009:1313739:1313796 [0] NCCL INFO Channel 01/0 : 4[7000] -> 12[7000] [send] via NET/IB/0 +gpub009:1313739:1313796 [0] NCCL INFO Channel 01/0 : 12[7000] -> 4[7000] [receive] via NET/IB/0 +gpub009:1313739:1313796 [0] NCCL INFO Channel 00/0 : 9[46000] -> 4[7000] [receive] via NET/IB/0 +gpub009:1313739:1313796 [0] NCCL INFO Channel 01/0 : 4[7000] -> 0[7000] [send] via NET/IB/0 +gpub009:1313739:1313796 [0] NCCL INFO Connected all trees +gpub009:1313739:1313796 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub009:1313739:1313796 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub009:1313739:1313796 [0] NCCL INFO comm 0xd47f4d80 rank 4 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub039:1773546:1773609 [1] NCCL INFO Channel 00/0 : 53[46000] -> 52[7000] via P2P/IPC +gpub039:1773546:1773609 [1] NCCL INFO Channel 01/0 : 53[46000] -> 52[7000] via P2P/IPC +gpub039:1773546:1773609 [1] NCCL INFO Connected all trees +gpub039:1773546:1773609 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub039:1773546:1773609 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub039:1773546:1773609 [1] NCCL INFO comm 0x51566170 rank 53 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub059:1722911:1722975 [2] NCCL INFO Connected all trees +gpub059:1722911:1722975 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub059:1722911:1722975 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub059:1722911:1722975 [2] NCCL INFO comm 0x50aa6ce0 rank 70 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub067:1289110:1289110 [3] NCCL INFO cudaDriverVersion 12010 +gpub067:1289110:1289110 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.167<0> +gpub067:1289110:1289110 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub067:1289110:1289167 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.167<0> +gpub067:1289110:1289167 [3] NCCL INFO Using network IB +gpub067:1289110:1289167 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub067:1289110:1289167 [3] NCCL INFO Trees [0] -1/-1/-1->91->90 [1] -1/-1/-1->91->90 +gpub067:1289110:1289167 [3] NCCL INFO Channel 00/0 : 91[c7000] -> 92[7000] [send] via NET/IB/0 +gpub067:1289110:1289167 [3] NCCL INFO Channel 01/0 : 91[c7000] -> 92[7000] [send] via NET/IB/0 +gpub067:1289110:1289167 [3] NCCL INFO Connected all rings +gpub067:1289110:1289167 [3] NCCL INFO Channel 00/0 : 91[c7000] -> 90[85000] via P2P/IPC +gpub067:1289110:1289167 [3] NCCL INFO Channel 01/0 : 91[c7000] -> 90[85000] via P2P/IPC +gpub014:1242930:1242930 [0] NCCL INFO cudaDriverVersion 12010 +gpub014:1242930:1242930 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.114<0> +gpub014:1242930:1242930 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub014:1242930:1242994 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.114<0> +gpub014:1242930:1242994 [0] NCCL INFO Using network IB +gpub014:1242930:1242994 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub014:1242930:1242994 [0] NCCL INFO Trees [0] 21/-1/-1->20->25 [1] 21/16/-1->20->13 +gpub014:1242930:1242994 [0] NCCL INFO Channel 00/0 : 19[c7000] -> 20[7000] [receive] via NET/IB/0 +gpub014:1242930:1242994 [0] NCCL INFO Channel 01/0 : 19[c7000] -> 20[7000] [receive] via NET/IB/0 +gpub014:1242930:1242994 [0] NCCL INFO Channel 00/0 : 20[7000] -> 21[46000] via P2P/IPC +gpub014:1242930:1242994 [0] NCCL INFO Channel 01/0 : 20[7000] -> 21[46000] via P2P/IPC +gpub014:1242930:1242994 [0] NCCL INFO Connected all rings +gpub064:1376671:1376671 [2] NCCL INFO cudaDriverVersion 12010 +gpub064:1376671:1376671 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.164<0> +gpub064:1376671:1376671 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub064:1376671:1376737 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.164<0> +gpub064:1376671:1376737 [2] NCCL INFO Using network IB +gpub064:1376671:1376737 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub064:1376671:1376737 [2] NCCL INFO Trees [0] 79/-1/-1->78->77 [1] 79/-1/-1->78->77 +gpub064:1376671:1376737 [2] NCCL INFO Channel 00/0 : 78[85000] -> 79[c7000] via P2P/IPC +gpub064:1376671:1376737 [2] NCCL INFO Channel 01/0 : 78[85000] -> 79[c7000] via P2P/IPC +gpub064:1376671:1376737 [2] NCCL INFO Connected all rings +gpub064:1376671:1376737 [2] NCCL INFO Channel 00/0 : 78[85000] -> 77[46000] via P2P/IPC +gpub064:1376671:1376737 [2] NCCL INFO Channel 01/0 : 78[85000] -> 77[46000] via P2P/IPC +gpub037:1358541:1358597 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [send] via NET/IB/0 +gpub037:1358541:1358597 [3] NCCL INFO Connected all rings +gpub037:1358541:1358597 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 46[85000] via P2P/IPC +gpub037:1358541:1358597 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 46[85000] via P2P/IPC +gpub037:1358541:1358597 [3] NCCL INFO Connected all trees +gpub037:1358541:1358597 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub037:1358541:1358597 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub037:1358541:1358597 [3] NCCL INFO comm 0x9c2bba0 rank 47 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub012:1262646:1262646 [2] NCCL INFO cudaDriverVersion 12010 +gpub012:1262646:1262646 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.112<0> +gpub012:1262646:1262646 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub012:1262646:1262708 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.112<0> +gpub012:1262646:1262708 [2] NCCL INFO Using network IB +gpub012:1262646:1262708 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub012:1262646:1262708 [2] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 +gpub012:1262646:1262708 [2] NCCL INFO Channel 00/0 : 14[85000] -> 15[c7000] via P2P/IPC +gpub012:1262646:1262708 [2] NCCL INFO Channel 01/0 : 14[85000] -> 15[c7000] via P2P/IPC +gpub012:1262646:1262708 [2] NCCL INFO Connected all rings +gpub012:1262646:1262708 [2] NCCL INFO Channel 00/0 : 14[85000] -> 13[46000] via P2P/IPC +gpub012:1262646:1262708 [2] NCCL INFO Channel 01/0 : 14[85000] -> 13[46000] via P2P/IPC +gpub032:2709150:2709150 [2] NCCL INFO cudaDriverVersion 12010 +gpub032:2709150:2709150 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.132<0> +gpub032:2709150:2709150 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub032:2709150:2709213 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.132<0> +gpub032:2709150:2709213 [2] NCCL INFO Using network IB +gpub032:2709150:2709213 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub032:2709150:2709213 [2] NCCL INFO Trees [0] 39/-1/-1->38->37 [1] 39/-1/-1->38->37 +gpub032:2709150:2709213 [2] NCCL INFO Channel 00/0 : 38[85000] -> 39[c7000] via P2P/IPC +gpub032:2709150:2709213 [2] NCCL INFO Channel 01/0 : 38[85000] -> 39[c7000] via P2P/IPC +gpub032:2709150:2709213 [2] NCCL INFO Connected all rings +gpub032:2709150:2709213 [2] NCCL INFO Channel 00/0 : 38[85000] -> 37[46000] via P2P/IPC +gpub032:2709150:2709213 [2] NCCL INFO Channel 01/0 : 38[85000] -> 37[46000] via P2P/IPC +gpub089:1443527:1443527 [1] NCCL INFO cudaDriverVersion 12010 +gpub089:1443527:1443527 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.189<0> +gpub089:1443527:1443527 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub089:1443527:1443889 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.189<0> +gpub089:1443527:1443889 [1] NCCL INFO Using network IB +gpub089:1443527:1443889 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub089:1443527:1443889 [1] NCCL INFO Trees [0] 118/-1/-1->117->116 [1] 118/120/-1->117->116 +gpub089:1443527:1443889 [1] NCCL INFO Channel 00/0 : 117[46000] -> 118[85000] via P2P/IPC +gpub089:1443527:1443889 [1] NCCL INFO Channel 01/0 : 117[46000] -> 118[85000] via P2P/IPC +gpub089:1443527:1443889 [1] NCCL INFO Connected all rings +gpub089:1443527:1443889 [1] NCCL INFO Channel 01/0 : 117[46000] -> 120[7000] [send] via NET/IB/0 +gpub089:1443527:1443889 [1] NCCL INFO Channel 01/0 : 120[7000] -> 117[46000] [receive] via NET/IB/0 +gpub009:1313741:1313741 [2] NCCL INFO cudaDriverVersion 12010 +gpub009:1313741:1313741 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.109<0> +gpub009:1313741:1313741 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub009:1313741:1313799 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.109<0> +gpub009:1313741:1313799 [2] NCCL INFO Using network IB +gpub009:1313741:1313799 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub009:1313741:1313799 [2] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 +gpub009:1313741:1313799 [2] NCCL INFO Channel 00/0 : 6[85000] -> 7[c7000] via P2P/IPC +gpub009:1313741:1313799 [2] NCCL INFO Channel 01/0 : 6[85000] -> 7[c7000] via P2P/IPC +gpub009:1313741:1313799 [2] NCCL INFO Connected all rings +gpub009:1313741:1313799 [2] NCCL INFO Channel 00/0 : 6[85000] -> 5[46000] via P2P/IPC +gpub009:1313741:1313799 [2] NCCL INFO Channel 01/0 : 6[85000] -> 5[46000] via P2P/IPC +gpub039:1773548:1773548 [3] NCCL INFO cudaDriverVersion 12010 +gpub039:1773548:1773548 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.139<0> +gpub039:1773548:1773548 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub039:1773548:1773612 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.139<0> +gpub039:1773548:1773612 [3] NCCL INFO Using network IB +gpub039:1773548:1773612 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub039:1773548:1773612 [3] NCCL INFO Trees [0] -1/-1/-1->55->54 [1] -1/-1/-1->55->54 +gpub039:1773548:1773612 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [send] via NET/IB/0 +gpub039:1773548:1773612 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [send] via NET/IB/0 +gpub039:1773548:1773612 [3] NCCL INFO Connected all rings +gpub039:1773548:1773612 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 54[85000] via P2P/IPC +gpub039:1773548:1773612 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 54[85000] via P2P/IPC +gpub067:1289110:1289167 [3] NCCL INFO Connected all trees +gpub067:1289110:1289167 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub067:1289110:1289167 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub067:1289110:1289167 [3] NCCL INFO comm 0x50179280 rank 91 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub014:1242930:1242994 [0] NCCL INFO Channel 01/0 : 16[7000] -> 20[7000] [receive] via NET/IB/0 +gpub014:1242930:1242994 [0] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [send] via NET/IB/0 +gpub014:1242930:1242994 [0] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [receive] via NET/IB/0 +gpub014:1242930:1242994 [0] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [send] via NET/IB/0 +gpub014:1242930:1242994 [0] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [receive] via NET/IB/0 +gpub014:1242930:1242994 [0] NCCL INFO Channel 01/0 : 20[7000] -> 16[7000] [send] via NET/IB/0 +gpub014:1242930:1242994 [0] NCCL INFO Connected all trees +gpub014:1242930:1242994 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub014:1242930:1242994 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub014:1242930:1242994 [0] NCCL INFO comm 0x9cc906c0 rank 20 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub064:1376671:1376737 [2] NCCL INFO Connected all trees +gpub064:1376671:1376737 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub064:1376671:1376737 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub064:1376671:1376737 [2] NCCL INFO comm 0x8cf5dc0 rank 78 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub037:1358539:1358539 [1] NCCL INFO cudaDriverVersion 12010 +gpub037:1358539:1358539 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.137<0> +gpub037:1358539:1358539 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub037:1358539:1358600 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.137<0> +gpub037:1358539:1358600 [1] NCCL INFO Using network IB +gpub037:1358539:1358600 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub037:1358539:1358600 [1] NCCL INFO Trees [0] 46/-1/-1->45->44 [1] 46/52/-1->45->44 +gpub037:1358539:1358600 [1] NCCL INFO Channel 00/0 : 45[46000] -> 46[85000] via P2P/IPC +gpub037:1358539:1358600 [1] NCCL INFO Channel 01/0 : 45[46000] -> 46[85000] via P2P/IPC +gpub037:1358539:1358600 [1] NCCL INFO Connected all rings +gpub037:1358539:1358600 [1] NCCL INFO Channel 01/0 : 45[46000] -> 52[7000] [send] via NET/IB/0 +gpub037:1358539:1358600 [1] NCCL INFO Channel 01/0 : 52[7000] -> 45[46000] [receive] via NET/IB/0 +gpub012:1262646:1262708 [2] NCCL INFO Connected all trees +gpub012:1262646:1262708 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub012:1262646:1262708 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub012:1262646:1262708 [2] NCCL INFO comm 0xa94349a0 rank 14 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub032:2709150:2709213 [2] NCCL INFO Connected all trees +gpub032:2709150:2709213 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub032:2709150:2709213 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub032:2709150:2709213 [2] NCCL INFO comm 0x50422250 rank 38 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub089:1443527:1443889 [1] NCCL INFO Channel 00/0 : 117[46000] -> 116[7000] via P2P/IPC +gpub089:1443527:1443889 [1] NCCL INFO Channel 01/0 : 117[46000] -> 116[7000] via P2P/IPC +gpub089:1443527:1443889 [1] NCCL INFO Connected all trees +gpub089:1443527:1443889 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub089:1443527:1443889 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub089:1443527:1443889 [1] NCCL INFO comm 0x8c6983c0 rank 117 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub009:1313741:1313799 [2] NCCL INFO Connected all trees +gpub009:1313741:1313799 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub009:1313741:1313799 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub009:1313741:1313799 [2] NCCL INFO comm 0x50fe65e0 rank 6 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub039:1773548:1773612 [3] NCCL INFO Connected all trees +gpub039:1773548:1773612 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub039:1773548:1773612 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub039:1773548:1773612 [3] NCCL INFO comm 0x8ad50680 rank 55 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub067:1289109:1289109 [2] NCCL INFO cudaDriverVersion 12010 +gpub067:1289109:1289109 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.167<0> +gpub067:1289109:1289109 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub067:1289109:1289166 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.167<0> +gpub067:1289109:1289166 [2] NCCL INFO Using network IB +gpub067:1289109:1289166 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub067:1289109:1289166 [2] NCCL INFO Trees [0] 91/-1/-1->90->89 [1] 91/-1/-1->90->89 +gpub067:1289109:1289166 [2] NCCL INFO Channel 00/0 : 90[85000] -> 91[c7000] via P2P/IPC +gpub067:1289109:1289166 [2] NCCL INFO Channel 01/0 : 90[85000] -> 91[c7000] via P2P/IPC +gpub067:1289109:1289166 [2] NCCL INFO Connected all rings +gpub067:1289109:1289166 [2] NCCL INFO Channel 00/0 : 90[85000] -> 89[46000] via P2P/IPC +gpub067:1289109:1289166 [2] NCCL INFO Channel 01/0 : 90[85000] -> 89[46000] via P2P/IPC +gpub014:1242933:1242933 [3] NCCL INFO cudaDriverVersion 12010 +gpub014:1242933:1242933 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.114<0> +gpub014:1242933:1242933 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub014:1242933:1242997 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.114<0> +gpub014:1242933:1242997 [3] NCCL INFO Using network IB +gpub014:1242933:1242997 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub014:1242933:1242997 [3] NCCL INFO Trees [0] -1/-1/-1->23->22 [1] -1/-1/-1->23->22 +gpub014:1242933:1242997 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [send] via NET/IB/0 +gpub014:1242933:1242997 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [send] via NET/IB/0 +gpub014:1242933:1242997 [3] NCCL INFO Connected all rings +gpub014:1242933:1242997 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 22[85000] via P2P/IPC +gpub014:1242933:1242997 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 22[85000] via P2P/IPC +gpub064:1376672:1376672 [3] NCCL INFO cudaDriverVersion 12010 +gpub064:1376672:1376672 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.164<0> +gpub064:1376672:1376672 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub064:1376672:1376735 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.164<0> +gpub064:1376672:1376735 [3] NCCL INFO Using network IB +gpub064:1376672:1376735 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub064:1376672:1376735 [3] NCCL INFO Trees [0] -1/-1/-1->79->78 [1] -1/-1/-1->79->78 +gpub064:1376672:1376735 [3] NCCL INFO Channel 00/0 : 79[c7000] -> 80[7000] [send] via NET/IB/0 +gpub064:1376672:1376735 [3] NCCL INFO Channel 01/0 : 79[c7000] -> 80[7000] [send] via NET/IB/0 +gpub064:1376672:1376735 [3] NCCL INFO Connected all rings +gpub064:1376672:1376735 [3] NCCL INFO Channel 00/0 : 79[c7000] -> 78[85000] via P2P/IPC +gpub064:1376672:1376735 [3] NCCL INFO Channel 01/0 : 79[c7000] -> 78[85000] via P2P/IPC +gpub037:1358539:1358600 [1] NCCL INFO Channel 00/0 : 45[46000] -> 44[7000] via P2P/IPC +gpub037:1358539:1358600 [1] NCCL INFO Channel 01/0 : 45[46000] -> 44[7000] via P2P/IPC +gpub037:1358539:1358600 [1] NCCL INFO Connected all trees +gpub037:1358539:1358600 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub037:1358539:1358600 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub037:1358539:1358600 [1] NCCL INFO comm 0x4f6a3790 rank 45 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub032:2709151:2709151 [3] NCCL INFO cudaDriverVersion 12010 +gpub032:2709151:2709151 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.132<0> +gpub032:2709151:2709151 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub032:2709151:2709214 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.132<0> +gpub032:2709151:2709214 [3] NCCL INFO Using network IB +gpub032:2709151:2709214 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub032:2709151:2709214 [3] NCCL INFO Trees [0] -1/-1/-1->39->38 [1] -1/-1/-1->39->38 +gpub032:2709151:2709214 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [send] via NET/IB/0 +gpub032:2709151:2709214 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [send] via NET/IB/0 +gpub032:2709151:2709214 [3] NCCL INFO Connected all rings +gpub032:2709151:2709214 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 38[85000] via P2P/IPC +gpub032:2709151:2709214 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 38[85000] via P2P/IPC +gpub089:1443528:1443528 [2] NCCL INFO cudaDriverVersion 12010 +gpub089:1443528:1443528 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.189<0> +gpub089:1443528:1443528 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub089:1443528:1443888 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.189<0> +gpub089:1443528:1443888 [2] NCCL INFO Using network IB +gpub089:1443528:1443888 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub089:1443528:1443888 [2] NCCL INFO Trees [0] 119/-1/-1->118->117 [1] 119/-1/-1->118->117 +gpub089:1443528:1443888 [2] NCCL INFO Channel 00/0 : 118[85000] -> 119[c7000] via P2P/IPC +gpub089:1443528:1443888 [2] NCCL INFO Channel 01/0 : 118[85000] -> 119[c7000] via P2P/IPC +gpub089:1443528:1443888 [2] NCCL INFO Connected all rings +gpub089:1443528:1443888 [2] NCCL INFO Channel 00/0 : 118[85000] -> 117[46000] via P2P/IPC +gpub089:1443528:1443888 [2] NCCL INFO Channel 01/0 : 118[85000] -> 117[46000] via P2P/IPC +gpub039:1773545:1773545 [0] NCCL INFO cudaDriverVersion 12010 +gpub039:1773545:1773545 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.139<0> +gpub039:1773545:1773545 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub039:1773545:1773610 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.139<0> +gpub039:1773545:1773610 [0] NCCL INFO Using network IB +gpub039:1773545:1773610 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub039:1773545:1773610 [0] NCCL INFO Trees [0] 53/-1/-1->52->57 [1] 53/48/-1->52->45 +gpub039:1773545:1773610 [0] NCCL INFO Channel 00/0 : 51[c7000] -> 52[7000] [receive] via NET/IB/0 +gpub039:1773545:1773610 [0] NCCL INFO Channel 01/0 : 51[c7000] -> 52[7000] [receive] via NET/IB/0 +gpub039:1773545:1773610 [0] NCCL INFO Channel 00/0 : 52[7000] -> 53[46000] via P2P/IPC +gpub039:1773545:1773610 [0] NCCL INFO Channel 01/0 : 52[7000] -> 53[46000] via P2P/IPC +gpub039:1773545:1773610 [0] NCCL INFO Connected all rings +gpub067:1289109:1289166 [2] NCCL INFO Connected all trees +gpub067:1289109:1289166 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub067:1289109:1289166 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub067:1289109:1289166 [2] NCCL INFO comm 0x4f921a60 rank 90 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub014:1242933:1242997 [3] NCCL INFO Connected all trees +gpub014:1242933:1242997 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub014:1242933:1242997 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub014:1242933:1242997 [3] NCCL INFO comm 0x9eb06d0 rank 23 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub064:1376672:1376735 [3] NCCL INFO Connected all trees +gpub064:1376672:1376735 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub064:1376672:1376735 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub064:1376672:1376735 [3] NCCL INFO comm 0xc156a340 rank 79 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub037:1358538:1358538 [0] NCCL INFO cudaDriverVersion 12010 +gpub037:1358538:1358538 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.137<0> +gpub037:1358538:1358538 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub037:1358538:1358599 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.137<0> +gpub037:1358538:1358599 [0] NCCL INFO Using network IB +gpub037:1358538:1358599 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub037:1358538:1358599 [0] NCCL INFO Trees [0] 45/-1/-1->44->40 [1] 45/36/-1->44->29 +gpub037:1358538:1358599 [0] NCCL INFO Channel 00/0 : 43[c7000] -> 44[7000] [receive] via NET/IB/0 +gpub037:1358538:1358599 [0] NCCL INFO Channel 01/0 : 43[c7000] -> 44[7000] [receive] via NET/IB/0 +gpub037:1358538:1358599 [0] NCCL INFO Channel 00/0 : 44[7000] -> 45[46000] via P2P/IPC +gpub037:1358538:1358599 [0] NCCL INFO Channel 01/0 : 44[7000] -> 45[46000] via P2P/IPC +gpub037:1358538:1358599 [0] NCCL INFO Connected all rings +gpub032:2709151:2709214 [3] NCCL INFO Connected all trees +gpub032:2709151:2709214 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub032:2709151:2709214 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub032:2709151:2709214 [3] NCCL INFO comm 0x514b7d40 rank 39 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub089:1443528:1443888 [2] NCCL INFO Connected all trees +gpub089:1443528:1443888 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub089:1443528:1443888 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub089:1443528:1443888 [2] NCCL INFO comm 0xb9b05cd0 rank 118 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub039:1773545:1773610 [0] NCCL INFO Channel 01/0 : 48[7000] -> 52[7000] [receive] via NET/IB/0 +gpub039:1773545:1773610 [0] NCCL INFO Channel 00/0 : 52[7000] -> 57[46000] [send] via NET/IB/0 +gpub039:1773545:1773610 [0] NCCL INFO Channel 01/0 : 45[46000] -> 52[7000] [receive] via NET/IB/0 +gpub039:1773545:1773610 [0] NCCL INFO Channel 01/0 : 52[7000] -> 45[46000] [send] via NET/IB/0 +gpub039:1773545:1773610 [0] NCCL INFO Channel 00/0 : 57[46000] -> 52[7000] [receive] via NET/IB/0 +gpub039:1773545:1773610 [0] NCCL INFO Channel 01/0 : 52[7000] -> 48[7000] [send] via NET/IB/0 +gpub039:1773545:1773610 [0] NCCL INFO Connected all trees +gpub039:1773545:1773610 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub039:1773545:1773610 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub039:1773545:1773610 [0] NCCL INFO comm 0x5171f660 rank 52 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub067:1289108:1289108 [1] NCCL INFO cudaDriverVersion 12010 +gpub067:1289108:1289108 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.167<0> +gpub067:1289108:1289108 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub067:1289108:1289168 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.167<0> +gpub067:1289108:1289168 [1] NCCL INFO Using network IB +gpub067:1289108:1289168 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub067:1289108:1289168 [1] NCCL INFO Trees [0] 90/84/-1->89->88 [1] 90/-1/-1->89->88 +gpub067:1289108:1289168 [1] NCCL INFO Channel 00/0 : 89[46000] -> 90[85000] via P2P/IPC +gpub067:1289108:1289168 [1] NCCL INFO Channel 01/0 : 89[46000] -> 90[85000] via P2P/IPC +gpub067:1289108:1289168 [1] NCCL INFO Connected all rings +gpub067:1289108:1289168 [1] NCCL INFO Channel 00/0 : 84[7000] -> 89[46000] [receive] via NET/IB/0 +gpub067:1289108:1289168 [1] NCCL INFO Channel 00/0 : 89[46000] -> 84[7000] [send] via NET/IB/0 +gpub064:1376669:1376669 [0] NCCL INFO cudaDriverVersion 12010 +gpub064:1376669:1376669 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.164<0> +gpub064:1376669:1376669 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub064:1376669:1376734 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.164<0> +gpub064:1376669:1376734 [0] NCCL INFO Using network IB +gpub064:1376669:1376734 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub064:1376669:1376734 [0] NCCL INFO Trees [0] 77/-1/-1->76->72 [1] 77/68/-1->76->92 +gpub064:1376669:1376734 [0] NCCL INFO Channel 00/0 : 75[c7000] -> 76[7000] [receive] via NET/IB/0 +gpub064:1376669:1376734 [0] NCCL INFO Channel 01/0 : 75[c7000] -> 76[7000] [receive] via NET/IB/0 +gpub064:1376669:1376734 [0] NCCL INFO Channel 00/0 : 76[7000] -> 77[46000] via P2P/IPC +gpub064:1376669:1376734 [0] NCCL INFO Channel 01/0 : 76[7000] -> 77[46000] via P2P/IPC +gpub064:1376669:1376734 [0] NCCL INFO Connected all rings +gpub037:1358538:1358599 [0] NCCL INFO Channel 00/0 : 40[7000] -> 44[7000] [receive] via NET/IB/0 +gpub037:1358538:1358599 [0] NCCL INFO Channel 01/0 : 36[7000] -> 44[7000] [receive] via NET/IB/0 +gpub037:1358538:1358599 [0] NCCL INFO Channel 01/0 : 29[46000] -> 44[7000] [receive] via NET/IB/0 +gpub037:1358538:1358599 [0] NCCL INFO Channel 01/0 : 44[7000] -> 29[46000] [send] via NET/IB/0 +gpub037:1358538:1358599 [0] NCCL INFO Channel 01/0 : 44[7000] -> 36[7000] [send] via NET/IB/0 +gpub037:1358538:1358599 [0] NCCL INFO Channel 00/0 : 44[7000] -> 40[7000] [send] via NET/IB/0 +gpub037:1358538:1358599 [0] NCCL INFO Connected all trees +gpub037:1358538:1358599 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub037:1358538:1358599 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub037:1358538:1358599 [0] NCCL INFO comm 0x4fcb02a0 rank 44 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub089:1443529:1443529 [3] NCCL INFO cudaDriverVersion 12010 +gpub089:1443529:1443529 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.189<0> +gpub089:1443529:1443529 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub089:1443529:1443887 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.189<0> +gpub089:1443529:1443887 [3] NCCL INFO Using network IB +gpub089:1443529:1443887 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub089:1443529:1443887 [3] NCCL INFO Trees [0] -1/-1/-1->119->118 [1] -1/-1/-1->119->118 +gpub089:1443529:1443887 [3] NCCL INFO Channel 00/0 : 119[c7000] -> 120[7000] [send] via NET/IB/0 +gpub089:1443529:1443887 [3] NCCL INFO Channel 01/0 : 119[c7000] -> 120[7000] [send] via NET/IB/0 +gpub089:1443529:1443887 [3] NCCL INFO Connected all rings +gpub089:1443529:1443887 [3] NCCL INFO Channel 00/0 : 119[c7000] -> 118[85000] via P2P/IPC +gpub089:1443529:1443887 [3] NCCL INFO Channel 01/0 : 119[c7000] -> 118[85000] via P2P/IPC +gpub067:1289108:1289168 [1] NCCL INFO Channel 00/0 : 89[46000] -> 88[7000] via P2P/IPC +gpub067:1289108:1289168 [1] NCCL INFO Channel 01/0 : 89[46000] -> 88[7000] via P2P/IPC +gpub067:1289108:1289168 [1] NCCL INFO Connected all trees +gpub067:1289108:1289168 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub067:1289108:1289168 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub067:1289108:1289168 [1] NCCL INFO comm 0x519a0210 rank 89 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub064:1376669:1376734 [0] NCCL INFO Channel 00/0 : 72[7000] -> 76[7000] [receive] via NET/IB/0 +gpub064:1376669:1376734 [0] NCCL INFO Channel 01/0 : 68[7000] -> 76[7000] [receive] via NET/IB/0 +gpub064:1376669:1376734 [0] NCCL INFO Channel 01/0 : 76[7000] -> 92[7000] [send] via NET/IB/0 +gpub064:1376669:1376734 [0] NCCL INFO Channel 01/0 : 92[7000] -> 76[7000] [receive] via NET/IB/0 +gpub064:1376669:1376734 [0] NCCL INFO Channel 01/0 : 76[7000] -> 68[7000] [send] via NET/IB/0 +gpub064:1376669:1376734 [0] NCCL INFO Channel 00/0 : 76[7000] -> 72[7000] [send] via NET/IB/0 +gpub064:1376669:1376734 [0] NCCL INFO Connected all trees +gpub064:1376669:1376734 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub064:1376669:1376734 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub064:1376669:1376734 [0] NCCL INFO comm 0x8d2203f0 rank 76 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub089:1443529:1443887 [3] NCCL INFO Connected all trees +gpub089:1443529:1443887 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub089:1443529:1443887 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub089:1443529:1443887 [3] NCCL INFO comm 0x8b1a64f0 rank 119 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub009:1313742:1313742 [3] NCCL INFO cudaDriverVersion 12010 +gpub009:1313742:1313742 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.109<0> +gpub009:1313742:1313742 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub009:1313742:1313798 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.109<0> +gpub009:1313742:1313798 [3] NCCL INFO Using network IB +gpub009:1313742:1313798 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub009:1313742:1313798 [3] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 +gpub009:1313742:1313798 [3] NCCL INFO Channel 00/0 : 7[c7000] -> 8[7000] [send] via NET/IB/0 +gpub009:1313742:1313798 [3] NCCL INFO Channel 01/0 : 7[c7000] -> 8[7000] [send] via NET/IB/0 +gpub009:1313742:1313798 [3] NCCL INFO Connected all rings +gpub009:1313742:1313798 [3] NCCL INFO Channel 00/0 : 7[c7000] -> 6[85000] via P2P/IPC +gpub009:1313742:1313798 [3] NCCL INFO Channel 01/0 : 7[c7000] -> 6[85000] via P2P/IPC +gpub009:1313742:1313798 [3] NCCL INFO Connected all trees +gpub009:1313742:1313798 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub009:1313742:1313798 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub009:1313742:1313798 [3] NCCL INFO comm 0x9d5ff680 rank 7 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub032:2709148:2709148 [0] NCCL INFO cudaDriverVersion 12010 +gpub032:2709148:2709148 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.132<0> +gpub032:2709148:2709148 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub032:2709148:2709212 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.132<0> +gpub032:2709148:2709212 [0] NCCL INFO Using network IB +gpub032:2709148:2709212 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub032:2709148:2709212 [0] NCCL INFO Trees [0] 37/-1/-1->36->41 [1] 37/32/-1->36->44 +gpub032:2709148:2709212 [0] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [receive] via NET/IB/0 +gpub032:2709148:2709212 [0] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [receive] via NET/IB/0 +gpub032:2709148:2709212 [0] NCCL INFO Channel 00/0 : 36[7000] -> 37[46000] via P2P/IPC +gpub032:2709148:2709212 [0] NCCL INFO Channel 01/0 : 36[7000] -> 37[46000] via P2P/IPC +gpub032:2709148:2709212 [0] NCCL INFO Connected all rings +gpub032:2709148:2709212 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [receive] via NET/IB/0 +gpub032:2709148:2709212 [0] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [send] via NET/IB/0 +gpub032:2709148:2709212 [0] NCCL INFO Channel 01/0 : 36[7000] -> 44[7000] [send] via NET/IB/0 +gpub032:2709148:2709212 [0] NCCL INFO Channel 01/0 : 44[7000] -> 36[7000] [receive] via NET/IB/0 +gpub032:2709148:2709212 [0] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [receive] via NET/IB/0 +gpub032:2709148:2709212 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [send] via NET/IB/0 +gpub032:2709148:2709212 [0] NCCL INFO Connected all trees +gpub032:2709148:2709212 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub032:2709148:2709212 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub032:2709148:2709212 [0] NCCL INFO comm 0x9c01580 rank 36 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub039:1773547:1773547 [2] NCCL INFO cudaDriverVersion 12010 +gpub039:1773547:1773547 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.139<0> +gpub039:1773547:1773547 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub039:1773547:1773611 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.139<0> +gpub039:1773547:1773611 [2] NCCL INFO Using network IB +gpub039:1773547:1773611 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub039:1773547:1773611 [2] NCCL INFO Trees [0] 55/-1/-1->54->53 [1] 55/-1/-1->54->53 +gpub039:1773547:1773611 [2] NCCL INFO Channel 00/0 : 54[85000] -> 55[c7000] via P2P/IPC +gpub039:1773547:1773611 [2] NCCL INFO Channel 01/0 : 54[85000] -> 55[c7000] via P2P/IPC +gpub039:1773547:1773611 [2] NCCL INFO Connected all rings +gpub039:1773547:1773611 [2] NCCL INFO Channel 00/0 : 54[85000] -> 53[46000] via P2P/IPC +gpub039:1773547:1773611 [2] NCCL INFO Channel 01/0 : 54[85000] -> 53[46000] via P2P/IPC +gpub039:1773547:1773611 [2] NCCL INFO Connected all trees +gpub039:1773547:1773611 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub039:1773547:1773611 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub039:1773547:1773611 [2] NCCL INFO comm 0x516f33e0 rank 54 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub009:1313740:1313740 [1] NCCL INFO cudaDriverVersion 12010 +gpub009:1313740:1313740 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.109<0> +gpub009:1313740:1313740 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub009:1313740:1313797 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.109<0> +gpub009:1313740:1313797 [1] NCCL INFO Using network IB +gpub009:1313740:1313797 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub009:1313740:1313797 [1] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/8/-1->5->4 +gpub009:1313740:1313797 [1] NCCL INFO Channel 00/0 : 5[46000] -> 6[85000] via P2P/IPC +gpub009:1313740:1313797 [1] NCCL INFO Channel 01/0 : 5[46000] -> 6[85000] via P2P/IPC +gpub009:1313740:1313797 [1] NCCL INFO Connected all rings +gpub009:1313740:1313797 [1] NCCL INFO Channel 01/0 : 5[46000] -> 8[7000] [send] via NET/IB/0 +gpub009:1313740:1313797 [1] NCCL INFO Channel 01/0 : 8[7000] -> 5[46000] [receive] via NET/IB/0 +gpub009:1313740:1313797 [1] NCCL INFO Channel 00/0 : 5[46000] -> 4[7000] via P2P/IPC +gpub009:1313740:1313797 [1] NCCL INFO Channel 01/0 : 5[46000] -> 4[7000] via P2P/IPC +gpub009:1313740:1313797 [1] NCCL INFO Connected all trees +gpub009:1313740:1313797 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub009:1313740:1313797 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub009:1313740:1313797 [1] NCCL INFO comm 0x51a376d0 rank 5 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub014:1242931:1242931 [1] NCCL INFO cudaDriverVersion 12010 +gpub014:1242931:1242931 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.114<0> +gpub014:1242931:1242931 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub014:1242931:1242995 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.114<0> +gpub014:1242931:1242995 [1] NCCL INFO Using network IB +gpub014:1242931:1242995 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub014:1242931:1242995 [1] NCCL INFO Trees [0] 22/-1/-1->21->20 [1] 22/24/-1->21->20 +gpub014:1242931:1242995 [1] NCCL INFO Channel 00/0 : 21[46000] -> 22[85000] via P2P/IPC +gpub014:1242931:1242995 [1] NCCL INFO Channel 01/0 : 21[46000] -> 22[85000] via P2P/IPC +gpub014:1242931:1242995 [1] NCCL INFO Connected all rings +gpub014:1242931:1242995 [1] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [send] via NET/IB/0 +gpub014:1242931:1242995 [1] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [receive] via NET/IB/0 +gpub014:1242931:1242995 [1] NCCL INFO Channel 00/0 : 21[46000] -> 20[7000] via P2P/IPC +gpub014:1242931:1242995 [1] NCCL INFO Channel 01/0 : 21[46000] -> 20[7000] via P2P/IPC +gpub014:1242931:1242995 [1] NCCL INFO Connected all trees +gpub014:1242931:1242995 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub014:1242931:1242995 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub014:1242931:1242995 [1] NCCL INFO comm 0xa3cf110 rank 21 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub012:1262645:1262645 [1] NCCL INFO cudaDriverVersion 12010 +gpub012:1262645:1262645 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.112<0> +gpub012:1262645:1262645 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub012:1262645:1262707 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.112<0> +gpub012:1262645:1262707 [1] NCCL INFO Using network IB +gpub012:1262645:1262707 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub012:1262645:1262707 [1] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/20/-1->13->12 +gpub012:1262645:1262707 [1] NCCL INFO Channel 00/0 : 13[46000] -> 14[85000] via P2P/IPC +gpub012:1262645:1262707 [1] NCCL INFO Channel 01/0 : 13[46000] -> 14[85000] via P2P/IPC +gpub012:1262645:1262707 [1] NCCL INFO Connected all rings +gpub012:1262645:1262707 [1] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [send] via NET/IB/0 +gpub012:1262645:1262707 [1] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [receive] via NET/IB/0 +gpub012:1262645:1262707 [1] NCCL INFO Channel 00/0 : 13[46000] -> 12[7000] via P2P/IPC +gpub012:1262645:1262707 [1] NCCL INFO Channel 01/0 : 13[46000] -> 12[7000] via P2P/IPC +gpub012:1262645:1262707 [1] NCCL INFO Connected all trees +gpub012:1262645:1262707 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub012:1262645:1262707 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub012:1262645:1262707 [1] NCCL INFO comm 0x8bc40190 rank 13 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub059:1722912:1722912 [3] NCCL INFO cudaDriverVersion 12010 +gpub059:1722912:1722912 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.159<0> +gpub059:1722912:1722912 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub059:1722912:1722976 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.159<0> +gpub059:1722912:1722976 [3] NCCL INFO Using network IB +gpub059:1722912:1722976 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub059:1722912:1722976 [3] NCCL INFO Trees [0] -1/-1/-1->71->70 [1] -1/-1/-1->71->70 +gpub059:1722912:1722976 [3] NCCL INFO Channel 00/0 : 71[c7000] -> 72[7000] [send] via NET/IB/0 +gpub059:1722912:1722976 [3] NCCL INFO Channel 01/0 : 71[c7000] -> 72[7000] [send] via NET/IB/0 +gpub059:1722912:1722976 [3] NCCL INFO Connected all rings +gpub059:1722912:1722976 [3] NCCL INFO Channel 00/0 : 71[c7000] -> 70[85000] via P2P/IPC +gpub059:1722912:1722976 [3] NCCL INFO Channel 01/0 : 71[c7000] -> 70[85000] via P2P/IPC +gpub059:1722912:1722976 [3] NCCL INFO Connected all trees +gpub059:1722912:1722976 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub059:1722912:1722976 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub059:1722912:1722976 [3] NCCL INFO comm 0x50e6bd70 rank 71 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub059:1722910:1722910 [1] NCCL INFO cudaDriverVersion 12010 +gpub059:1722910:1722910 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.159<0> +gpub059:1722910:1722910 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub059:1722910:1722978 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.159<0> +gpub059:1722910:1722978 [1] NCCL INFO Using network IB +gpub059:1722910:1722978 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub059:1722910:1722978 [1] NCCL INFO Trees [0] 70/-1/-1->69->68 [1] 70/72/-1->69->68 +gpub059:1722910:1722978 [1] NCCL INFO Channel 00/0 : 69[46000] -> 70[85000] via P2P/IPC +gpub059:1722910:1722978 [1] NCCL INFO Channel 01/0 : 69[46000] -> 70[85000] via P2P/IPC +gpub059:1722910:1722978 [1] NCCL INFO Connected all rings +gpub059:1722910:1722978 [1] NCCL INFO Channel 01/0 : 69[46000] -> 72[7000] [send] via NET/IB/0 +gpub059:1722910:1722978 [1] NCCL INFO Channel 01/0 : 72[7000] -> 69[46000] [receive] via NET/IB/0 +gpub059:1722910:1722978 [1] NCCL INFO Channel 00/0 : 69[46000] -> 68[7000] via P2P/IPC +gpub059:1722910:1722978 [1] NCCL INFO Channel 01/0 : 69[46000] -> 68[7000] via P2P/IPC +gpub059:1722910:1722978 [1] NCCL INFO Connected all trees +gpub059:1722910:1722978 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub059:1722910:1722978 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub059:1722910:1722978 [1] NCCL INFO comm 0x4fd07e80 rank 69 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub059:1722909:1722909 [0] NCCL INFO cudaDriverVersion 12010 +gpub059:1722909:1722909 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.159<0> +gpub059:1722909:1722909 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub059:1722909:1722977 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.159<0> +gpub059:1722909:1722977 [0] NCCL INFO Using network IB +gpub059:1722909:1722977 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub059:1722909:1722977 [0] NCCL INFO Trees [0] 69/-1/-1->68->73 [1] 69/64/-1->68->76 +gpub059:1722909:1722977 [0] NCCL INFO Channel 00/0 : 67[c7000] -> 68[7000] [receive] via NET/IB/0 +gpub059:1722909:1722977 [0] NCCL INFO Channel 01/0 : 67[c7000] -> 68[7000] [receive] via NET/IB/0 +gpub059:1722909:1722977 [0] NCCL INFO Channel 00/0 : 68[7000] -> 69[46000] via P2P/IPC +gpub059:1722909:1722977 [0] NCCL INFO Channel 01/0 : 68[7000] -> 69[46000] via P2P/IPC +gpub059:1722909:1722977 [0] NCCL INFO Connected all rings +gpub059:1722909:1722977 [0] NCCL INFO Channel 01/0 : 64[7000] -> 68[7000] [receive] via NET/IB/0 +gpub059:1722909:1722977 [0] NCCL INFO Channel 00/0 : 68[7000] -> 73[46000] [send] via NET/IB/0 +gpub059:1722909:1722977 [0] NCCL INFO Channel 01/0 : 68[7000] -> 76[7000] [send] via NET/IB/0 +gpub059:1722909:1722977 [0] NCCL INFO Channel 01/0 : 76[7000] -> 68[7000] [receive] via NET/IB/0 +gpub059:1722909:1722977 [0] NCCL INFO Channel 00/0 : 73[46000] -> 68[7000] [receive] via NET/IB/0 +gpub059:1722909:1722977 [0] NCCL INFO Channel 01/0 : 68[7000] -> 64[7000] [send] via NET/IB/0 +gpub059:1722909:1722977 [0] NCCL INFO Connected all trees +gpub059:1722909:1722977 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub059:1722909:1722977 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub059:1722909:1722977 [0] NCCL INFO comm 0x4f5becc0 rank 68 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub088:1265112:1265112 [2] NCCL INFO cudaDriverVersion 12010 +gpub088:1265112:1265112 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.188<0> +gpub088:1265112:1265112 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub088:1265112:1265182 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.188<0> +gpub088:1265112:1265182 [2] NCCL INFO Using network IB +gpub088:1265112:1265182 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub088:1265112:1265182 [2] NCCL INFO Trees [0] 115/-1/-1->114->113 [1] 115/-1/-1->114->113 +gpub088:1265112:1265182 [2] NCCL INFO Channel 00/0 : 114[85000] -> 115[c7000] via P2P/IPC +gpub088:1265112:1265182 [2] NCCL INFO Channel 01/0 : 114[85000] -> 115[c7000] via P2P/IPC +gpub088:1265112:1265182 [2] NCCL INFO Connected all rings +gpub088:1265112:1265182 [2] NCCL INFO Channel 00/0 : 114[85000] -> 113[46000] via P2P/IPC +gpub088:1265112:1265182 [2] NCCL INFO Channel 01/0 : 114[85000] -> 113[46000] via P2P/IPC +gpub088:1265112:1265182 [2] NCCL INFO Connected all trees +gpub088:1265112:1265182 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub088:1265112:1265182 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub088:1265112:1265182 [2] NCCL INFO comm 0x8c4ab250 rank 114 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub088:1265110:1265110 [0] NCCL INFO cudaDriverVersion 12010 +gpub088:1265110:1265110 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.188<0> +gpub088:1265110:1265110 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub088:1265110:1265183 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.188<0> +gpub088:1265110:1265183 [0] NCCL INFO Using network IB +gpub088:1265110:1265183 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub088:1265110:1265183 [0] NCCL INFO Trees [0] 113/120/-1->112->96 [1] 113/-1/-1->112->116 +gpub088:1265110:1265183 [0] NCCL INFO Channel 00/0 : 111[c7000] -> 112[7000] [receive] via NET/IB/0 +gpub088:1265110:1265183 [0] NCCL INFO Channel 01/0 : 111[c7000] -> 112[7000] [receive] via NET/IB/0 +gpub088:1265110:1265183 [0] NCCL INFO Channel 00/0 : 112[7000] -> 113[46000] via P2P/IPC +gpub088:1265110:1265183 [0] NCCL INFO Channel 01/0 : 112[7000] -> 113[46000] via P2P/IPC +gpub088:1265110:1265183 [0] NCCL INFO Connected all rings +gpub088:1265110:1265183 [0] NCCL INFO Channel 01/0 : 112[7000] -> 116[7000] [send] via NET/IB/0 +gpub088:1265110:1265183 [0] NCCL INFO Channel 00/0 : 112[7000] -> 120[7000] [send] via NET/IB/0 +gpub088:1265110:1265183 [0] NCCL INFO Channel 00/0 : 96[7000] -> 112[7000] [receive] via NET/IB/0 +gpub088:1265110:1265183 [0] NCCL INFO Channel 00/0 : 112[7000] -> 96[7000] [send] via NET/IB/0 +gpub088:1265110:1265183 [0] NCCL INFO Channel 00/0 : 120[7000] -> 112[7000] [receive] via NET/IB/0 +gpub088:1265110:1265183 [0] NCCL INFO Channel 01/0 : 116[7000] -> 112[7000] [receive] via NET/IB/0 +gpub088:1265110:1265183 [0] NCCL INFO Connected all trees +gpub088:1265110:1265183 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub088:1265110:1265183 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub088:1265110:1265183 [0] NCCL INFO comm 0xa9d75ce0 rank 112 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub038:1419973:1419973 [3] NCCL INFO cudaDriverVersion 12010 +gpub038:1419973:1419973 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.138<0> +gpub038:1419973:1419973 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub038:1419973:1420037 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.138<0> +gpub038:1419973:1420037 [3] NCCL INFO Using network IB +gpub038:1419973:1420037 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub038:1419973:1420037 [3] NCCL INFO Trees [0] -1/-1/-1->51->50 [1] -1/-1/-1->51->50 +gpub038:1419973:1420037 [3] NCCL INFO Channel 00/0 : 51[c7000] -> 52[7000] [send] via NET/IB/0 +gpub038:1419973:1420037 [3] NCCL INFO Channel 01/0 : 51[c7000] -> 52[7000] [send] via NET/IB/0 +gpub038:1419973:1420037 [3] NCCL INFO Connected all rings +gpub038:1419973:1420037 [3] NCCL INFO Channel 00/0 : 51[c7000] -> 50[85000] via P2P/IPC +gpub038:1419973:1420037 [3] NCCL INFO Channel 01/0 : 51[c7000] -> 50[85000] via P2P/IPC +gpub038:1419973:1420037 [3] NCCL INFO Connected all trees +gpub038:1419973:1420037 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub038:1419973:1420037 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub038:1419973:1420037 [3] NCCL INFO comm 0x93398f0 rank 51 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub038:1419971:1419971 [1] NCCL INFO cudaDriverVersion 12010 +gpub038:1419971:1419971 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.138<0> +gpub038:1419971:1419971 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub038:1419971:1420034 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.138<0> +gpub038:1419971:1420034 [1] NCCL INFO Using network IB +gpub038:1419971:1420034 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub038:1419971:1420034 [1] NCCL INFO Trees [0] 50/40/-1->49->48 [1] 50/-1/-1->49->48 +gpub038:1419971:1420034 [1] NCCL INFO Channel 00/0 : 49[46000] -> 50[85000] via P2P/IPC +gpub038:1419971:1420034 [1] NCCL INFO Channel 01/0 : 49[46000] -> 50[85000] via P2P/IPC +gpub038:1419971:1420034 [1] NCCL INFO Connected all rings +gpub038:1419971:1420034 [1] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [receive] via NET/IB/0 +gpub038:1419971:1420034 [1] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [send] via NET/IB/0 +gpub038:1419971:1420034 [1] NCCL INFO Channel 00/0 : 49[46000] -> 48[7000] via P2P/IPC +gpub038:1419971:1420034 [1] NCCL INFO Channel 01/0 : 49[46000] -> 48[7000] via P2P/IPC +gpub038:1419971:1420034 [1] NCCL INFO Connected all trees +gpub038:1419971:1420034 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub038:1419971:1420034 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub038:1419971:1420034 [1] NCCL INFO comm 0xb5e5ede0 rank 49 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub038:1419972:1419972 [2] NCCL INFO cudaDriverVersion 12010 +gpub038:1419972:1419972 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.138<0> +gpub038:1419972:1419972 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub038:1419972:1420036 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.138<0> +gpub038:1419972:1420036 [2] NCCL INFO Using network IB +gpub038:1419972:1420036 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub038:1419972:1420036 [2] NCCL INFO Trees [0] 51/-1/-1->50->49 [1] 51/-1/-1->50->49 +gpub038:1419972:1420036 [2] NCCL INFO Channel 00/0 : 50[85000] -> 51[c7000] via P2P/IPC +gpub038:1419972:1420036 [2] NCCL INFO Channel 01/0 : 50[85000] -> 51[c7000] via P2P/IPC +gpub038:1419972:1420036 [2] NCCL INFO Connected all rings +gpub038:1419972:1420036 [2] NCCL INFO Channel 00/0 : 50[85000] -> 49[46000] via P2P/IPC +gpub038:1419972:1420036 [2] NCCL INFO Channel 01/0 : 50[85000] -> 49[46000] via P2P/IPC +gpub038:1419972:1420036 [2] NCCL INFO Connected all trees +gpub038:1419972:1420036 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub038:1419972:1420036 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub038:1419972:1420036 [2] NCCL INFO comm 0x94b05e40 rank 50 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub088:1265111:1265111 [1] NCCL INFO cudaDriverVersion 12010 +gpub088:1265111:1265111 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.188<0> +gpub088:1265111:1265111 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub088:1265111:1265184 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.188<0> +gpub088:1265111:1265184 [1] NCCL INFO Using network IB +gpub088:1265111:1265184 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub088:1265111:1265184 [1] NCCL INFO Trees [0] 114/104/-1->113->112 [1] 114/-1/-1->113->112 +gpub088:1265111:1265184 [1] NCCL INFO Channel 00/0 : 113[46000] -> 114[85000] via P2P/IPC +gpub088:1265111:1265184 [1] NCCL INFO Channel 01/0 : 113[46000] -> 114[85000] via P2P/IPC +gpub088:1265111:1265184 [1] NCCL INFO Connected all rings +gpub088:1265111:1265184 [1] NCCL INFO Channel 00/0 : 104[7000] -> 113[46000] [receive] via NET/IB/0 +gpub088:1265111:1265184 [1] NCCL INFO Channel 00/0 : 113[46000] -> 104[7000] [send] via NET/IB/0 +gpub088:1265111:1265184 [1] NCCL INFO Channel 00/0 : 113[46000] -> 112[7000] via P2P/IPC +gpub088:1265111:1265184 [1] NCCL INFO Channel 01/0 : 113[46000] -> 112[7000] via P2P/IPC +gpub088:1265111:1265184 [1] NCCL INFO Connected all trees +gpub088:1265111:1265184 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub088:1265111:1265184 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub088:1265111:1265184 [1] NCCL INFO comm 0xa8f2e890 rank 113 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub088:1265113:1265113 [3] NCCL INFO cudaDriverVersion 12010 +gpub088:1265113:1265113 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.188<0> +gpub088:1265113:1265113 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub088:1265113:1265181 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.188<0> +gpub088:1265113:1265181 [3] NCCL INFO Using network IB +gpub088:1265113:1265181 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub088:1265113:1265181 [3] NCCL INFO Trees [0] -1/-1/-1->115->114 [1] -1/-1/-1->115->114 +gpub088:1265113:1265181 [3] NCCL INFO Channel 00/0 : 115[c7000] -> 116[7000] [send] via NET/IB/0 +gpub088:1265113:1265181 [3] NCCL INFO Channel 01/0 : 115[c7000] -> 116[7000] [send] via NET/IB/0 +gpub088:1265113:1265181 [3] NCCL INFO Connected all rings +gpub088:1265113:1265181 [3] NCCL INFO Channel 00/0 : 115[c7000] -> 114[85000] via P2P/IPC +gpub088:1265113:1265181 [3] NCCL INFO Channel 01/0 : 115[c7000] -> 114[85000] via P2P/IPC +gpub088:1265113:1265181 [3] NCCL INFO Connected all trees +gpub088:1265113:1265181 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub088:1265113:1265181 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub088:1265113:1265181 [3] NCCL INFO comm 0xb90f03c0 rank 115 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub038:1419970:1419970 [0] NCCL INFO cudaDriverVersion 12010 +gpub038:1419970:1419970 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.138<0> +gpub038:1419970:1419970 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub038:1419970:1420035 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.138<0> +gpub038:1419970:1420035 [0] NCCL INFO Using network IB +gpub038:1419970:1420035 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub038:1419970:1420035 [0] NCCL INFO Trees [0] 49/56/-1->48->32 [1] 49/-1/-1->48->52 +gpub038:1419970:1420035 [0] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [receive] via NET/IB/0 +gpub038:1419970:1420035 [0] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [receive] via NET/IB/0 +gpub038:1419970:1420035 [0] NCCL INFO Channel 00/0 : 48[7000] -> 49[46000] via P2P/IPC +gpub038:1419970:1420035 [0] NCCL INFO Channel 01/0 : 48[7000] -> 49[46000] via P2P/IPC +gpub038:1419970:1420035 [0] NCCL INFO Connected all rings +gpub038:1419970:1420035 [0] NCCL INFO Channel 01/0 : 48[7000] -> 52[7000] [send] via NET/IB/0 +gpub038:1419970:1420035 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [send] via NET/IB/0 +gpub038:1419970:1420035 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [receive] via NET/IB/0 +gpub038:1419970:1420035 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [send] via NET/IB/0 +gpub038:1419970:1420035 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [receive] via NET/IB/0 +gpub038:1419970:1420035 [0] NCCL INFO Channel 01/0 : 52[7000] -> 48[7000] [receive] via NET/IB/0 +gpub038:1419970:1420035 [0] NCCL INFO Connected all trees +gpub038:1419970:1420035 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub038:1419970:1420035 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub038:1419970:1420035 [0] NCCL INFO comm 0x8b623c00 rank 48 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub083:326512:326512 [3] NCCL INFO cudaDriverVersion 12010 +gpub083:326512:326512 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.183<0> +gpub083:326512:326512 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub083:326512:326570 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.183<0> +gpub083:326512:326570 [3] NCCL INFO Using network IB +gpub083:326512:326570 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub083:326512:326570 [3] NCCL INFO Trees [0] -1/-1/-1->107->106 [1] -1/-1/-1->107->106 +gpub083:326512:326570 [3] NCCL INFO Channel 00/0 : 107[c7000] -> 108[7000] [send] via NET/IB/0 +gpub083:326512:326570 [3] NCCL INFO Channel 01/0 : 107[c7000] -> 108[7000] [send] via NET/IB/0 +gpub083:326512:326570 [3] NCCL INFO Connected all rings +gpub083:326512:326570 [3] NCCL INFO Channel 00/0 : 107[c7000] -> 106[85000] via P2P/IPC +gpub083:326512:326570 [3] NCCL INFO Channel 01/0 : 107[c7000] -> 106[85000] via P2P/IPC +gpub083:326512:326570 [3] NCCL INFO Connected all trees +gpub083:326512:326570 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub083:326512:326570 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub083:326512:326570 [3] NCCL INFO comm 0x50bb38f0 rank 107 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub083:326511:326511 [2] NCCL INFO cudaDriverVersion 12010 +gpub083:326511:326511 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.183<0> +gpub083:326511:326511 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub083:326511:326571 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.183<0> +gpub083:326511:326571 [2] NCCL INFO Using network IB +gpub083:326511:326571 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub083:326511:326571 [2] NCCL INFO Trees [0] 107/-1/-1->106->105 [1] 107/-1/-1->106->105 +gpub083:326511:326571 [2] NCCL INFO Channel 00/0 : 106[85000] -> 107[c7000] via P2P/IPC +gpub083:326511:326571 [2] NCCL INFO Channel 01/0 : 106[85000] -> 107[c7000] via P2P/IPC +gpub083:326511:326571 [2] NCCL INFO Connected all rings +gpub083:326511:326571 [2] NCCL INFO Channel 00/0 : 106[85000] -> 105[46000] via P2P/IPC +gpub083:326511:326571 [2] NCCL INFO Channel 01/0 : 106[85000] -> 105[46000] via P2P/IPC +gpub083:326511:326571 [2] NCCL INFO Connected all trees +gpub083:326511:326571 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub083:326511:326571 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub083:326511:326571 [2] NCCL INFO comm 0x8cf83ae0 rank 106 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub083:326509:326509 [0] NCCL INFO cudaDriverVersion 12010 +gpub083:326509:326509 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.183<0> +gpub083:326509:326509 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub083:326509:326573 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.183<0> +gpub083:326509:326573 [0] NCCL INFO Using network IB +gpub083:326509:326573 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub083:326509:326573 [0] NCCL INFO Trees [0] 105/108/-1->104->113 [1] 105/-1/-1->104->101 +gpub083:326509:326573 [0] NCCL INFO Channel 00/0 : 103[c7000] -> 104[7000] [receive] via NET/IB/0 +gpub083:326509:326573 [0] NCCL INFO Channel 01/0 : 103[c7000] -> 104[7000] [receive] via NET/IB/0 +gpub083:326509:326573 [0] NCCL INFO Channel 00/0 : 104[7000] -> 105[46000] via P2P/IPC +gpub083:326509:326573 [0] NCCL INFO Channel 01/0 : 104[7000] -> 105[46000] via P2P/IPC +gpub083:326509:326573 [0] NCCL INFO Connected all rings +gpub083:326509:326573 [0] NCCL INFO Channel 01/0 : 101[46000] -> 104[7000] [receive] via NET/IB/0 +gpub083:326509:326573 [0] NCCL INFO Channel 00/0 : 104[7000] -> 108[7000] [send] via NET/IB/0 +gpub083:326509:326573 [0] NCCL INFO Channel 00/0 : 104[7000] -> 113[46000] [send] via NET/IB/0 +gpub083:326509:326573 [0] NCCL INFO Channel 00/0 : 113[46000] -> 104[7000] [receive] via NET/IB/0 +gpub083:326509:326573 [0] NCCL INFO Channel 00/0 : 108[7000] -> 104[7000] [receive] via NET/IB/0 +gpub083:326509:326573 [0] NCCL INFO Channel 01/0 : 104[7000] -> 101[46000] [send] via NET/IB/0 +gpub083:326509:326573 [0] NCCL INFO Connected all trees +gpub083:326509:326573 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub083:326509:326573 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub083:326509:326573 [0] NCCL INFO comm 0x50391690 rank 104 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub013:1454152:1454152 [0] NCCL INFO cudaDriverVersion 12010 +gpub013:1454152:1454152 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.113<0> +gpub013:1454152:1454152 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub013:1454152:1454217 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.113<0> +gpub013:1454152:1454217 [0] NCCL INFO Using network IB +gpub013:1454152:1454217 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub013:1454152:1454217 [0] NCCL INFO Trees [0] 17/24/-1->16->33 [1] 17/-1/-1->16->20 +gpub013:1454152:1454217 [0] NCCL INFO Channel 00/0 : 15[c7000] -> 16[7000] [receive] via NET/IB/0 +gpub013:1454152:1454217 [0] NCCL INFO Channel 01/0 : 15[c7000] -> 16[7000] [receive] via NET/IB/0 +gpub013:1454152:1454217 [0] NCCL INFO Channel 00/0 : 16[7000] -> 17[46000] via P2P/IPC +gpub013:1454152:1454217 [0] NCCL INFO Channel 01/0 : 16[7000] -> 17[46000] via P2P/IPC +gpub013:1454152:1454217 [0] NCCL INFO Connected all rings +gpub013:1454152:1454217 [0] NCCL INFO Channel 01/0 : 16[7000] -> 20[7000] [send] via NET/IB/0 +gpub013:1454152:1454217 [0] NCCL INFO Channel 00/0 : 16[7000] -> 24[7000] [send] via NET/IB/0 +gpub013:1454152:1454217 [0] NCCL INFO Channel 00/0 : 16[7000] -> 33[46000] [send] via NET/IB/0 +gpub013:1454152:1454217 [0] NCCL INFO Channel 00/0 : 33[46000] -> 16[7000] [receive] via NET/IB/0 +gpub013:1454152:1454217 [0] NCCL INFO Channel 00/0 : 24[7000] -> 16[7000] [receive] via NET/IB/0 +gpub013:1454152:1454217 [0] NCCL INFO Channel 01/0 : 20[7000] -> 16[7000] [receive] via NET/IB/0 +gpub013:1454152:1454217 [0] NCCL INFO Connected all trees +gpub013:1454152:1454217 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub013:1454152:1454217 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub013:1454152:1454217 [0] NCCL INFO comm 0xbf779d50 rank 16 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub013:1454154:1454154 [2] NCCL INFO cudaDriverVersion 12010 +gpub013:1454154:1454154 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.113<0> +gpub013:1454154:1454154 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub013:1454154:1454216 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.113<0> +gpub013:1454154:1454216 [2] NCCL INFO Using network IB +gpub013:1454154:1454216 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub013:1454154:1454216 [2] NCCL INFO Trees [0] 19/-1/-1->18->17 [1] 19/-1/-1->18->17 +gpub013:1454154:1454216 [2] NCCL INFO Channel 00/0 : 18[85000] -> 19[c7000] via P2P/IPC +gpub013:1454154:1454216 [2] NCCL INFO Channel 01/0 : 18[85000] -> 19[c7000] via P2P/IPC +gpub013:1454154:1454216 [2] NCCL INFO Connected all rings +gpub013:1454154:1454216 [2] NCCL INFO Channel 00/0 : 18[85000] -> 17[46000] via P2P/IPC +gpub013:1454154:1454216 [2] NCCL INFO Channel 01/0 : 18[85000] -> 17[46000] via P2P/IPC +gpub013:1454154:1454216 [2] NCCL INFO Connected all trees +gpub013:1454154:1454216 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub013:1454154:1454216 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub013:1454154:1454216 [2] NCCL INFO comm 0x8c14b260 rank 18 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub065:1317230:1317230 [3] NCCL INFO cudaDriverVersion 12010 +gpub065:1317230:1317230 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.165<0> +gpub065:1317230:1317230 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub065:1317230:1317295 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.165<0> +gpub065:1317230:1317295 [3] NCCL INFO Using network IB +gpub065:1317230:1317295 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub065:1317230:1317295 [3] NCCL INFO Trees [0] -1/-1/-1->83->82 [1] -1/-1/-1->83->82 +gpub065:1317230:1317295 [3] NCCL INFO Channel 00/0 : 83[c7000] -> 84[7000] [send] via NET/IB/0 +gpub065:1317230:1317295 [3] NCCL INFO Channel 01/0 : 83[c7000] -> 84[7000] [send] via NET/IB/0 +gpub065:1317230:1317295 [3] NCCL INFO Connected all rings +gpub065:1317230:1317295 [3] NCCL INFO Channel 00/0 : 83[c7000] -> 82[85000] via P2P/IPC +gpub065:1317230:1317295 [3] NCCL INFO Channel 01/0 : 83[c7000] -> 82[85000] via P2P/IPC +gpub065:1317230:1317295 [3] NCCL INFO Connected all trees +gpub065:1317230:1317295 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub065:1317230:1317295 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub065:1317230:1317295 [3] NCCL INFO comm 0x8ebce5d0 rank 83 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub065:1317229:1317229 [2] NCCL INFO cudaDriverVersion 12010 +gpub065:1317229:1317229 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.165<0> +gpub065:1317229:1317229 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub065:1317229:1317296 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.165<0> +gpub065:1317229:1317296 [2] NCCL INFO Using network IB +gpub065:1317229:1317296 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub065:1317229:1317296 [2] NCCL INFO Trees [0] 83/-1/-1->82->81 [1] 83/-1/-1->82->81 +gpub065:1317229:1317296 [2] NCCL INFO Channel 00/0 : 82[85000] -> 83[c7000] via P2P/IPC +gpub065:1317229:1317296 [2] NCCL INFO Channel 01/0 : 82[85000] -> 83[c7000] via P2P/IPC +gpub065:1317229:1317296 [2] NCCL INFO Connected all rings +gpub065:1317229:1317296 [2] NCCL INFO Channel 00/0 : 82[85000] -> 81[46000] via P2P/IPC +gpub065:1317229:1317296 [2] NCCL INFO Channel 01/0 : 82[85000] -> 81[46000] via P2P/IPC +gpub065:1317229:1317296 [2] NCCL INFO Connected all trees +gpub065:1317229:1317296 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub065:1317229:1317296 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub065:1317229:1317296 [2] NCCL INFO comm 0x50c3e4a0 rank 82 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub013:1454155:1454155 [3] NCCL INFO cudaDriverVersion 12010 +gpub013:1454155:1454155 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.113<0> +gpub013:1454155:1454155 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub013:1454155:1454218 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.113<0> +gpub013:1454155:1454218 [3] NCCL INFO Using network IB +gpub013:1454155:1454218 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub013:1454155:1454218 [3] NCCL INFO Trees [0] -1/-1/-1->19->18 [1] -1/-1/-1->19->18 +gpub013:1454155:1454218 [3] NCCL INFO Channel 00/0 : 19[c7000] -> 20[7000] [send] via NET/IB/0 +gpub013:1454155:1454218 [3] NCCL INFO Channel 01/0 : 19[c7000] -> 20[7000] [send] via NET/IB/0 +gpub013:1454155:1454218 [3] NCCL INFO Connected all rings +gpub013:1454155:1454218 [3] NCCL INFO Channel 00/0 : 19[c7000] -> 18[85000] via P2P/IPC +gpub013:1454155:1454218 [3] NCCL INFO Channel 01/0 : 19[c7000] -> 18[85000] via P2P/IPC +gpub013:1454155:1454218 [3] NCCL INFO Connected all trees +gpub013:1454155:1454218 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub013:1454155:1454218 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub013:1454155:1454218 [3] NCCL INFO comm 0xaba06b70 rank 19 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub083:326510:326510 [1] NCCL INFO cudaDriverVersion 12010 +gpub083:326510:326510 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.183<0> +gpub083:326510:326510 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub083:326510:326572 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.183<0> +gpub083:326510:326572 [1] NCCL INFO Using network IB +gpub083:326510:326572 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub083:326510:326572 [1] NCCL INFO Trees [0] 106/100/-1->105->104 [1] 106/-1/-1->105->104 +gpub083:326510:326572 [1] NCCL INFO Channel 00/0 : 105[46000] -> 106[85000] via P2P/IPC +gpub083:326510:326572 [1] NCCL INFO Channel 01/0 : 105[46000] -> 106[85000] via P2P/IPC +gpub083:326510:326572 [1] NCCL INFO Connected all rings +gpub083:326510:326572 [1] NCCL INFO Channel 00/0 : 100[7000] -> 105[46000] [receive] via NET/IB/0 +gpub083:326510:326572 [1] NCCL INFO Channel 00/0 : 105[46000] -> 100[7000] [send] via NET/IB/0 +gpub083:326510:326572 [1] NCCL INFO Channel 00/0 : 105[46000] -> 104[7000] via P2P/IPC +gpub083:326510:326572 [1] NCCL INFO Channel 01/0 : 105[46000] -> 104[7000] via P2P/IPC +gpub083:326510:326572 [1] NCCL INFO Connected all trees +gpub083:326510:326572 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub083:326510:326572 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub083:326510:326572 [1] NCCL INFO comm 0xb489cca0 rank 105 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub065:1317228:1317228 [1] NCCL INFO cudaDriverVersion 12010 +gpub065:1317228:1317228 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.165<0> +gpub065:1317228:1317228 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub065:1317228:1317297 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.165<0> +gpub065:1317228:1317297 [1] NCCL INFO Using network IB +gpub065:1317228:1317297 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub065:1317228:1317297 [1] NCCL INFO Trees [0] 82/72/-1->81->80 [1] 82/-1/-1->81->80 +gpub065:1317228:1317297 [1] NCCL INFO Channel 00/0 : 81[46000] -> 82[85000] via P2P/IPC +gpub065:1317228:1317297 [1] NCCL INFO Channel 01/0 : 81[46000] -> 82[85000] via P2P/IPC +gpub065:1317228:1317297 [1] NCCL INFO Connected all rings +gpub065:1317228:1317297 [1] NCCL INFO Channel 00/0 : 72[7000] -> 81[46000] [receive] via NET/IB/0 +gpub065:1317228:1317297 [1] NCCL INFO Channel 00/0 : 81[46000] -> 72[7000] [send] via NET/IB/0 +gpub065:1317228:1317297 [1] NCCL INFO Channel 00/0 : 81[46000] -> 80[7000] via P2P/IPC +gpub065:1317228:1317297 [1] NCCL INFO Channel 01/0 : 81[46000] -> 80[7000] via P2P/IPC +gpub065:1317228:1317297 [1] NCCL INFO Connected all trees +gpub065:1317228:1317297 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub065:1317228:1317297 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub065:1317228:1317297 [1] NCCL INFO comm 0x9cb3f50 rank 81 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub013:1454153:1454153 [1] NCCL INFO cudaDriverVersion 12010 +gpub013:1454153:1454153 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.113<0> +gpub013:1454153:1454153 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub013:1454153:1454219 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.113<0> +gpub013:1454153:1454219 [1] NCCL INFO Using network IB +gpub013:1454153:1454219 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub013:1454153:1454219 [1] NCCL INFO Trees [0] 18/8/-1->17->16 [1] 18/-1/-1->17->16 +gpub013:1454153:1454219 [1] NCCL INFO Channel 00/0 : 17[46000] -> 18[85000] via P2P/IPC +gpub013:1454153:1454219 [1] NCCL INFO Channel 01/0 : 17[46000] -> 18[85000] via P2P/IPC +gpub013:1454153:1454219 [1] NCCL INFO Connected all rings +gpub013:1454153:1454219 [1] NCCL INFO Channel 00/0 : 8[7000] -> 17[46000] [receive] via NET/IB/0 +gpub013:1454153:1454219 [1] NCCL INFO Channel 00/0 : 17[46000] -> 8[7000] [send] via NET/IB/0 +gpub013:1454153:1454219 [1] NCCL INFO Channel 00/0 : 17[46000] -> 16[7000] via P2P/IPC +gpub013:1454153:1454219 [1] NCCL INFO Channel 01/0 : 17[46000] -> 16[7000] via P2P/IPC +gpub013:1454153:1454219 [1] NCCL INFO Connected all trees +gpub013:1454153:1454219 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub013:1454153:1454219 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub013:1454153:1454219 [1] NCCL INFO comm 0x504a7bd0 rank 17 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub075:323056:323056 [1] NCCL INFO cudaDriverVersion 12010 +gpub075:323056:323056 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.175<0> +gpub075:323056:323056 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub075:323056:323126 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.175<0> +gpub075:323056:323126 [1] NCCL INFO Using network IB +gpub075:323056:323126 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub075:323056:323126 [1] NCCL INFO Trees [0] 98/80/-1->97->96 [1] 98/-1/-1->97->96 +gpub075:323056:323126 [1] NCCL INFO Channel 00/0 : 97[46000] -> 98[85000] via P2P/IPC +gpub075:323056:323126 [1] NCCL INFO Channel 01/0 : 97[46000] -> 98[85000] via P2P/IPC +gpub075:323056:323126 [1] NCCL INFO Connected all rings +gpub075:323056:323126 [1] NCCL INFO Channel 00/0 : 80[7000] -> 97[46000] [receive] via NET/IB/0 +gpub075:323056:323126 [1] NCCL INFO Channel 00/0 : 97[46000] -> 80[7000] [send] via NET/IB/0 +gpub075:323056:323126 [1] NCCL INFO Channel 00/0 : 97[46000] -> 96[7000] via P2P/IPC +gpub075:323056:323126 [1] NCCL INFO Channel 01/0 : 97[46000] -> 96[7000] via P2P/IPC +gpub075:323056:323126 [1] NCCL INFO Connected all trees +gpub075:323056:323126 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub075:323056:323126 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub075:323056:323126 [1] NCCL INFO comm 0xa49e4e0 rank 97 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub065:1317227:1317227 [0] NCCL INFO cudaDriverVersion 12010 +gpub065:1317227:1317227 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.165<0> +gpub065:1317227:1317227 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub065:1317227:1317294 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.165<0> +gpub065:1317227:1317294 [0] NCCL INFO Using network IB +gpub065:1317227:1317294 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub065:1317227:1317294 [0] NCCL INFO Trees [0] 81/88/-1->80->97 [1] 81/-1/-1->80->84 +gpub065:1317227:1317294 [0] NCCL INFO Channel 00/0 : 79[c7000] -> 80[7000] [receive] via NET/IB/0 +gpub065:1317227:1317294 [0] NCCL INFO Channel 01/0 : 79[c7000] -> 80[7000] [receive] via NET/IB/0 +gpub065:1317227:1317294 [0] NCCL INFO Channel 00/0 : 80[7000] -> 81[46000] via P2P/IPC +gpub065:1317227:1317294 [0] NCCL INFO Channel 01/0 : 80[7000] -> 81[46000] via P2P/IPC +gpub065:1317227:1317294 [0] NCCL INFO Connected all rings +gpub065:1317227:1317294 [0] NCCL INFO Channel 01/0 : 80[7000] -> 84[7000] [send] via NET/IB/0 +gpub065:1317227:1317294 [0] NCCL INFO Channel 00/0 : 80[7000] -> 88[7000] [send] via NET/IB/0 +gpub065:1317227:1317294 [0] NCCL INFO Channel 00/0 : 80[7000] -> 97[46000] [send] via NET/IB/0 +gpub065:1317227:1317294 [0] NCCL INFO Channel 00/0 : 97[46000] -> 80[7000] [receive] via NET/IB/0 +gpub065:1317227:1317294 [0] NCCL INFO Channel 00/0 : 88[7000] -> 80[7000] [receive] via NET/IB/0 +gpub065:1317227:1317294 [0] NCCL INFO Channel 01/0 : 84[7000] -> 80[7000] [receive] via NET/IB/0 +gpub065:1317227:1317294 [0] NCCL INFO Connected all trees +gpub065:1317227:1317294 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub065:1317227:1317294 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub065:1317227:1317294 [0] NCCL INFO comm 0xb6488400 rank 80 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub068:1244814:1244814 [2] NCCL INFO cudaDriverVersion 12010 +gpub068:1244814:1244814 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.168<0> +gpub068:1244814:1244814 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub068:1244814:1244875 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.168<0> +gpub068:1244814:1244875 [2] NCCL INFO Using network IB +gpub068:1244814:1244875 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub068:1244814:1244875 [2] NCCL INFO Trees [0] 95/-1/-1->94->93 [1] 95/-1/-1->94->93 +gpub068:1244814:1244875 [2] NCCL INFO Channel 00/0 : 94[85000] -> 95[c7000] via P2P/IPC +gpub068:1244814:1244875 [2] NCCL INFO Channel 01/0 : 94[85000] -> 95[c7000] via P2P/IPC +gpub068:1244814:1244875 [2] NCCL INFO Connected all rings +gpub068:1244814:1244875 [2] NCCL INFO Channel 00/0 : 94[85000] -> 93[46000] via P2P/IPC +gpub068:1244814:1244875 [2] NCCL INFO Channel 01/0 : 94[85000] -> 93[46000] via P2P/IPC +gpub068:1244814:1244875 [2] NCCL INFO Connected all trees +gpub068:1244814:1244875 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub068:1244814:1244875 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub068:1244814:1244875 [2] NCCL INFO comm 0xbeb7020 rank 94 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub068:1244815:1244815 [3] NCCL INFO cudaDriverVersion 12010 +gpub068:1244815:1244815 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.168<0> +gpub068:1244815:1244815 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub068:1244815:1244878 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.168<0> +gpub068:1244815:1244878 [3] NCCL INFO Using network IB +gpub068:1244815:1244878 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub068:1244815:1244878 [3] NCCL INFO Trees [0] -1/-1/-1->95->94 [1] -1/-1/-1->95->94 +gpub068:1244815:1244878 [3] NCCL INFO Channel 00/0 : 95[c7000] -> 96[7000] [send] via NET/IB/0 +gpub068:1244815:1244878 [3] NCCL INFO Channel 01/0 : 95[c7000] -> 96[7000] [send] via NET/IB/0 +gpub068:1244815:1244878 [3] NCCL INFO Connected all rings +gpub068:1244815:1244878 [3] NCCL INFO Channel 00/0 : 95[c7000] -> 94[85000] via P2P/IPC +gpub068:1244815:1244878 [3] NCCL INFO Channel 01/0 : 95[c7000] -> 94[85000] via P2P/IPC +gpub068:1244815:1244878 [3] NCCL INFO Connected all trees +gpub068:1244815:1244878 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub068:1244815:1244878 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub068:1244815:1244878 [3] NCCL INFO comm 0x8e4e7950 rank 95 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub058:1406058:1406058 [2] NCCL INFO cudaDriverVersion 12010 +gpub058:1406058:1406058 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.158<0> +gpub058:1406058:1406058 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub058:1406058:1406115 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.158<0> +gpub058:1406058:1406115 [2] NCCL INFO Using network IB +gpub058:1406058:1406115 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub058:1406058:1406115 [2] NCCL INFO Trees [0] 67/-1/-1->66->65 [1] 67/-1/-1->66->65 +gpub058:1406058:1406115 [2] NCCL INFO Channel 00/0 : 66[85000] -> 67[c7000] via P2P/IPC +gpub058:1406058:1406115 [2] NCCL INFO Channel 01/0 : 66[85000] -> 67[c7000] via P2P/IPC +gpub058:1406058:1406115 [2] NCCL INFO Connected all rings +gpub058:1406058:1406115 [2] NCCL INFO Channel 00/0 : 66[85000] -> 65[46000] via P2P/IPC +gpub058:1406058:1406115 [2] NCCL INFO Channel 01/0 : 66[85000] -> 65[46000] via P2P/IPC +gpub058:1406058:1406115 [2] NCCL INFO Connected all trees +gpub058:1406058:1406115 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub058:1406058:1406115 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub058:1406058:1406115 [2] NCCL INFO comm 0xb9766da0 rank 66 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub001:279948:280013 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.101<0> +gpub001:279948:280013 [0] NCCL INFO Using network IB +gpub001:279948:280013 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub001:279948:280013 [0] NCCL INFO Channel 00/02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 +gpub001:279948:280013 [0] NCCL INFO Channel 01/02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 +gpub001:279948:280013 [0] NCCL INFO Trees [0] 1/64/-1->0->-1 [1] 1/-1/-1->0->4 +gpub001:279948:280013 [0] NCCL INFO Channel 00/0 : 127[c7000] -> 0[7000] [receive] via NET/IB/0 +gpub001:279948:280013 [0] NCCL INFO Channel 01/0 : 127[c7000] -> 0[7000] [receive] via NET/IB/0 +gpub001:279948:280013 [0] NCCL INFO Channel 00/0 : 0[7000] -> 1[46000] via P2P/IPC +gpub001:279948:280013 [0] NCCL INFO Channel 01/0 : 0[7000] -> 1[46000] via P2P/IPC +gpub001:279948:280013 [0] NCCL INFO Connected all rings +gpub001:279948:280013 [0] NCCL INFO Channel 01/0 : 0[7000] -> 4[7000] [send] via NET/IB/0 +gpub001:279948:280013 [0] NCCL INFO Channel 00/0 : 64[7000] -> 0[7000] [receive] via NET/IB/0 +gpub001:279948:280013 [0] NCCL INFO Channel 00/0 : 0[7000] -> 64[7000] [send] via NET/IB/0 +gpub001:279948:280013 [0] NCCL INFO Channel 01/0 : 4[7000] -> 0[7000] [receive] via NET/IB/0 +gpub001:279948:280013 [0] NCCL INFO Connected all trees +gpub001:279948:280013 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub001:279948:280013 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub001:279948:280013 [0] NCCL INFO comm 0x4fb35be0 rank 0 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub075:323057:323057 [2] NCCL INFO cudaDriverVersion 12010 +gpub075:323057:323057 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.175<0> +gpub075:323057:323057 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub075:323057:323128 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.175<0> +gpub075:323057:323128 [2] NCCL INFO Using network IB +gpub075:323057:323128 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub075:323057:323128 [2] NCCL INFO Trees [0] 99/-1/-1->98->97 [1] 99/-1/-1->98->97 +gpub075:323057:323128 [2] NCCL INFO Channel 00/0 : 98[85000] -> 99[c7000] via P2P/IPC +gpub075:323057:323128 [2] NCCL INFO Channel 01/0 : 98[85000] -> 99[c7000] via P2P/IPC +gpub075:323057:323128 [2] NCCL INFO Connected all rings +gpub075:323057:323128 [2] NCCL INFO Channel 00/0 : 98[85000] -> 97[46000] via P2P/IPC +gpub075:323057:323128 [2] NCCL INFO Channel 01/0 : 98[85000] -> 97[46000] via P2P/IPC +gpub075:323057:323128 [2] NCCL INFO Connected all trees +gpub075:323057:323128 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub075:323057:323128 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub075:323057:323128 [2] NCCL INFO comm 0xb913a860 rank 98 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub061:1342467:1342467 [1] NCCL INFO cudaDriverVersion 12010 +gpub061:1342467:1342467 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.161<0> +gpub061:1342467:1342467 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub061:1342467:1342541 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.161<0> +gpub061:1342467:1342541 [1] NCCL INFO Using network IB +gpub061:1342467:1342541 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub061:1342467:1342541 [1] NCCL INFO Trees [0] 74/68/-1->73->72 [1] 74/-1/-1->73->72 +gpub061:1342467:1342541 [1] NCCL INFO Channel 00/0 : 73[46000] -> 74[85000] via P2P/IPC +gpub061:1342467:1342541 [1] NCCL INFO Channel 01/0 : 73[46000] -> 74[85000] via P2P/IPC +gpub061:1342467:1342541 [1] NCCL INFO Connected all rings +gpub061:1342467:1342541 [1] NCCL INFO Channel 00/0 : 68[7000] -> 73[46000] [receive] via NET/IB/0 +gpub061:1342467:1342541 [1] NCCL INFO Channel 00/0 : 73[46000] -> 68[7000] [send] via NET/IB/0 +gpub061:1342467:1342541 [1] NCCL INFO Channel 00/0 : 73[46000] -> 72[7000] via P2P/IPC +gpub061:1342467:1342541 [1] NCCL INFO Channel 01/0 : 73[46000] -> 72[7000] via P2P/IPC +gpub061:1342467:1342541 [1] NCCL INFO Connected all trees +gpub061:1342467:1342541 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub061:1342467:1342541 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub061:1342467:1342541 [1] NCCL INFO comm 0x50772b20 rank 73 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub001:279950:279950 [2] NCCL INFO cudaDriverVersion 12010 +gpub001:279950:279950 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.101<0> +gpub001:279950:279950 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub001:279950:280016 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.101<0> +gpub001:279950:280016 [2] NCCL INFO Using network IB +gpub001:279950:280016 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub001:279950:280016 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +gpub001:279950:280016 [2] NCCL INFO Channel 00/0 : 2[85000] -> 3[c7000] via P2P/IPC +gpub001:279950:280016 [2] NCCL INFO Channel 01/0 : 2[85000] -> 3[c7000] via P2P/IPC +gpub001:279950:280016 [2] NCCL INFO Connected all rings +gpub001:279950:280016 [2] NCCL INFO Channel 00/0 : 2[85000] -> 1[46000] via P2P/IPC +gpub001:279950:280016 [2] NCCL INFO Channel 01/0 : 2[85000] -> 1[46000] via P2P/IPC +gpub001:279950:280016 [2] NCCL INFO Connected all trees +gpub001:279950:280016 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub001:279950:280016 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub001:279950:280016 [2] NCCL INFO comm 0x8c644f40 rank 2 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub091:1688862:1688862 [0] NCCL INFO cudaDriverVersion 12010 +gpub091:1688862:1688862 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.191<0> +gpub091:1688862:1688862 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub091:1688862:1688930 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.191<0> +gpub091:1688862:1688930 [0] NCCL INFO Using network IB +gpub091:1688862:1688930 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub091:1688862:1688930 [0] NCCL INFO Trees [0] 125/-1/-1->124->120 [1] 125/60/-1->124->-1 +gpub091:1688862:1688930 [0] NCCL INFO Channel 00/0 : 123[c7000] -> 124[7000] [receive] via NET/IB/0 +gpub091:1688862:1688930 [0] NCCL INFO Channel 01/0 : 123[c7000] -> 124[7000] [receive] via NET/IB/0 +gpub091:1688862:1688930 [0] NCCL INFO Channel 00/0 : 124[7000] -> 125[46000] via P2P/IPC +gpub091:1688862:1688930 [0] NCCL INFO Channel 01/0 : 124[7000] -> 125[46000] via P2P/IPC +gpub091:1688862:1688930 [0] NCCL INFO Connected all rings +gpub091:1688862:1688930 [0] NCCL INFO Channel 00/0 : 120[7000] -> 124[7000] [receive] via NET/IB/0 +gpub091:1688862:1688930 [0] NCCL INFO Channel 01/0 : 60[7000] -> 124[7000] [receive] via NET/IB/0 +gpub091:1688862:1688930 [0] NCCL INFO Channel 01/0 : 124[7000] -> 60[7000] [send] via NET/IB/0 +gpub091:1688862:1688930 [0] NCCL INFO Channel 00/0 : 124[7000] -> 120[7000] [send] via NET/IB/0 +gpub091:1688862:1688930 [0] NCCL INFO Connected all trees +gpub091:1688862:1688930 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub091:1688862:1688930 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub091:1688862:1688930 [0] NCCL INFO comm 0xb5c78140 rank 124 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub061:1342466:1342466 [0] NCCL INFO cudaDriverVersion 12010 +gpub061:1342466:1342466 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.161<0> +gpub061:1342466:1342466 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub061:1342466:1342539 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.161<0> +gpub061:1342466:1342539 [0] NCCL INFO Using network IB +gpub061:1342466:1342539 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub061:1342466:1342539 [0] NCCL INFO Trees [0] 73/76/-1->72->81 [1] 73/-1/-1->72->69 +gpub061:1342466:1342539 [0] NCCL INFO Channel 00/0 : 71[c7000] -> 72[7000] [receive] via NET/IB/0 +gpub061:1342466:1342539 [0] NCCL INFO Channel 01/0 : 71[c7000] -> 72[7000] [receive] via NET/IB/0 +gpub061:1342466:1342539 [0] NCCL INFO Channel 00/0 : 72[7000] -> 73[46000] via P2P/IPC +gpub061:1342466:1342539 [0] NCCL INFO Channel 01/0 : 72[7000] -> 73[46000] via P2P/IPC +gpub061:1342466:1342539 [0] NCCL INFO Connected all rings +gpub061:1342466:1342539 [0] NCCL INFO Channel 01/0 : 69[46000] -> 72[7000] [receive] via NET/IB/0 +gpub061:1342466:1342539 [0] NCCL INFO Channel 00/0 : 72[7000] -> 76[7000] [send] via NET/IB/0 +gpub061:1342466:1342539 [0] NCCL INFO Channel 00/0 : 72[7000] -> 81[46000] [send] via NET/IB/0 +gpub061:1342466:1342539 [0] NCCL INFO Channel 00/0 : 81[46000] -> 72[7000] [receive] via NET/IB/0 +gpub061:1342466:1342539 [0] NCCL INFO Channel 00/0 : 76[7000] -> 72[7000] [receive] via NET/IB/0 +gpub061:1342466:1342539 [0] NCCL INFO Channel 01/0 : 72[7000] -> 69[46000] [send] via NET/IB/0 +gpub061:1342466:1342539 [0] NCCL INFO Connected all trees +gpub061:1342466:1342539 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub061:1342466:1342539 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub061:1342466:1342539 [0] NCCL INFO comm 0x8c41510 rank 72 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub090:1179805:1179805 [2] NCCL INFO cudaDriverVersion 12010 +gpub090:1179805:1179805 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.190<0> +gpub090:1179805:1179805 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub090:1179805:1179868 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.190<0> +gpub090:1179805:1179868 [2] NCCL INFO Using network IB +gpub090:1179805:1179868 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub090:1179805:1179868 [2] NCCL INFO Trees [0] 123/-1/-1->122->121 [1] 123/-1/-1->122->121 +gpub090:1179805:1179868 [2] NCCL INFO Channel 00/0 : 122[85000] -> 123[c7000] via P2P/IPC +gpub090:1179805:1179868 [2] NCCL INFO Channel 01/0 : 122[85000] -> 123[c7000] via P2P/IPC +gpub090:1179805:1179868 [2] NCCL INFO Connected all rings +gpub090:1179805:1179868 [2] NCCL INFO Channel 00/0 : 122[85000] -> 121[46000] via P2P/IPC +gpub090:1179805:1179868 [2] NCCL INFO Channel 01/0 : 122[85000] -> 121[46000] via P2P/IPC +gpub075:323058:323058 [3] NCCL INFO cudaDriverVersion 12010 +gpub075:323058:323058 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.175<0> +gpub075:323058:323058 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub075:323058:323127 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.175<0> +gpub075:323058:323127 [3] NCCL INFO Using network IB +gpub075:323058:323127 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub075:323058:323127 [3] NCCL INFO Trees [0] -1/-1/-1->99->98 [1] -1/-1/-1->99->98 +gpub075:323058:323127 [3] NCCL INFO Channel 00/0 : 99[c7000] -> 100[7000] [send] via NET/IB/0 +gpub075:323058:323127 [3] NCCL INFO Channel 01/0 : 99[c7000] -> 100[7000] [send] via NET/IB/0 +gpub075:323058:323127 [3] NCCL INFO Connected all rings +gpub075:323058:323127 [3] NCCL INFO Channel 00/0 : 99[c7000] -> 98[85000] via P2P/IPC +gpub075:323058:323127 [3] NCCL INFO Channel 01/0 : 99[c7000] -> 98[85000] via P2P/IPC +gpub090:1179805:1179868 [2] NCCL INFO Connected all trees +gpub090:1179805:1179868 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub090:1179805:1179868 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub090:1179805:1179868 [2] NCCL INFO comm 0x50f1c1b0 rank 122 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub075:323058:323127 [3] NCCL INFO Connected all trees +gpub075:323058:323127 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub075:323058:323127 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub075:323058:323127 [3] NCCL INFO comm 0x50ae89c0 rank 99 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub091:1688864:1688864 [2] NCCL INFO cudaDriverVersion 12010 +gpub091:1688864:1688864 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.191<0> +gpub091:1688864:1688864 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub091:1688864:1688932 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.191<0> +gpub091:1688864:1688932 [2] NCCL INFO Using network IB +gpub091:1688864:1688932 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub091:1688864:1688932 [2] NCCL INFO Trees [0] 127/-1/-1->126->125 [1] 127/-1/-1->126->125 +gpub091:1688864:1688932 [2] NCCL INFO Channel 00/0 : 126[85000] -> 127[c7000] via P2P/IPC +gpub091:1688864:1688932 [2] NCCL INFO Channel 01/0 : 126[85000] -> 127[c7000] via P2P/IPC +gpub091:1688864:1688932 [2] NCCL INFO Connected all rings +gpub091:1688864:1688932 [2] NCCL INFO Channel 00/0 : 126[85000] -> 125[46000] via P2P/IPC +gpub091:1688864:1688932 [2] NCCL INFO Channel 01/0 : 126[85000] -> 125[46000] via P2P/IPC +gpub091:1688864:1688932 [2] NCCL INFO Connected all trees +gpub091:1688864:1688932 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub091:1688864:1688932 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub091:1688864:1688932 [2] NCCL INFO comm 0x51d52410 rank 126 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub058:1406059:1406059 [3] NCCL INFO cudaDriverVersion 12010 +gpub058:1406059:1406059 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.158<0> +gpub058:1406059:1406059 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub058:1406059:1406116 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.158<0> +gpub058:1406059:1406116 [3] NCCL INFO Using network IB +gpub058:1406059:1406116 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub058:1406059:1406116 [3] NCCL INFO Trees [0] -1/-1/-1->67->66 [1] -1/-1/-1->67->66 +gpub058:1406059:1406116 [3] NCCL INFO Channel 00/0 : 67[c7000] -> 68[7000] [send] via NET/IB/0 +gpub058:1406059:1406116 [3] NCCL INFO Channel 01/0 : 67[c7000] -> 68[7000] [send] via NET/IB/0 +gpub058:1406059:1406116 [3] NCCL INFO Connected all rings +gpub058:1406059:1406116 [3] NCCL INFO Channel 00/0 : 67[c7000] -> 66[85000] via P2P/IPC +gpub058:1406059:1406116 [3] NCCL INFO Channel 01/0 : 67[c7000] -> 66[85000] via P2P/IPC +gpub058:1406059:1406116 [3] NCCL INFO Connected all trees +gpub058:1406059:1406116 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub058:1406059:1406116 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub058:1406059:1406116 [3] NCCL INFO comm 0x5127edf0 rank 67 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub061:1342468:1342468 [2] NCCL INFO cudaDriverVersion 12010 +gpub061:1342468:1342468 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.161<0> +gpub061:1342468:1342468 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub061:1342468:1342540 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.161<0> +gpub061:1342468:1342540 [2] NCCL INFO Using network IB +gpub061:1342468:1342540 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub061:1342468:1342540 [2] NCCL INFO Trees [0] 75/-1/-1->74->73 [1] 75/-1/-1->74->73 +gpub061:1342468:1342540 [2] NCCL INFO Channel 00/0 : 74[85000] -> 75[c7000] via P2P/IPC +gpub061:1342468:1342540 [2] NCCL INFO Channel 01/0 : 74[85000] -> 75[c7000] via P2P/IPC +gpub061:1342468:1342540 [2] NCCL INFO Connected all rings +gpub061:1342468:1342540 [2] NCCL INFO Channel 00/0 : 74[85000] -> 73[46000] via P2P/IPC +gpub061:1342468:1342540 [2] NCCL INFO Channel 01/0 : 74[85000] -> 73[46000] via P2P/IPC +gpub061:1342468:1342540 [2] NCCL INFO Connected all trees +gpub061:1342468:1342540 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub061:1342468:1342540 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub061:1342468:1342540 [2] NCCL INFO comm 0xa2ca84b0 rank 74 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub061:1342469:1342469 [3] NCCL INFO cudaDriverVersion 12010 +gpub061:1342469:1342469 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.161<0> +gpub061:1342469:1342469 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub061:1342469:1342542 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.161<0> +gpub061:1342469:1342542 [3] NCCL INFO Using network IB +gpub061:1342469:1342542 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub061:1342469:1342542 [3] NCCL INFO Trees [0] -1/-1/-1->75->74 [1] -1/-1/-1->75->74 +gpub061:1342469:1342542 [3] NCCL INFO Channel 00/0 : 75[c7000] -> 76[7000] [send] via NET/IB/0 +gpub061:1342469:1342542 [3] NCCL INFO Channel 01/0 : 75[c7000] -> 76[7000] [send] via NET/IB/0 +gpub061:1342469:1342542 [3] NCCL INFO Connected all rings +gpub061:1342469:1342542 [3] NCCL INFO Channel 00/0 : 75[c7000] -> 74[85000] via P2P/IPC +gpub061:1342469:1342542 [3] NCCL INFO Channel 01/0 : 75[c7000] -> 74[85000] via P2P/IPC +gpub075:323055:323055 [0] NCCL INFO cudaDriverVersion 12010 +gpub075:323055:323055 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.175<0> +gpub075:323055:323055 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub075:323055:323125 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.175<0> +gpub075:323055:323125 [0] NCCL INFO Using network IB +gpub075:323055:323125 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub075:323055:323125 [0] NCCL INFO Trees [0] 97/112/-1->96->64 [1] 97/-1/-1->96->100 +gpub075:323055:323125 [0] NCCL INFO Channel 00/0 : 95[c7000] -> 96[7000] [receive] via NET/IB/0 +gpub075:323055:323125 [0] NCCL INFO Channel 01/0 : 95[c7000] -> 96[7000] [receive] via NET/IB/0 +gpub075:323055:323125 [0] NCCL INFO Channel 00/0 : 96[7000] -> 97[46000] via P2P/IPC +gpub075:323055:323125 [0] NCCL INFO Channel 01/0 : 96[7000] -> 97[46000] via P2P/IPC +gpub075:323055:323125 [0] NCCL INFO Connected all rings +gpub061:1342469:1342542 [3] NCCL INFO Connected all trees +gpub061:1342469:1342542 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub061:1342469:1342542 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub061:1342469:1342542 [3] NCCL INFO comm 0xb6128330 rank 75 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub075:323055:323125 [0] NCCL INFO Channel 01/0 : 96[7000] -> 100[7000] [send] via NET/IB/0 +gpub075:323055:323125 [0] NCCL INFO Channel 00/0 : 96[7000] -> 112[7000] [send] via NET/IB/0 +gpub075:323055:323125 [0] NCCL INFO Channel 00/0 : 64[7000] -> 96[7000] [receive] via NET/IB/0 +gpub075:323055:323125 [0] NCCL INFO Channel 00/0 : 96[7000] -> 64[7000] [send] via NET/IB/0 +gpub075:323055:323125 [0] NCCL INFO Channel 00/0 : 112[7000] -> 96[7000] [receive] via NET/IB/0 +gpub075:323055:323125 [0] NCCL INFO Channel 01/0 : 100[7000] -> 96[7000] [receive] via NET/IB/0 +gpub075:323055:323125 [0] NCCL INFO Connected all trees +gpub075:323055:323125 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub075:323055:323125 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub075:323055:323125 [0] NCCL INFO comm 0x4fef3c60 rank 96 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub090:1179803:1179803 [0] NCCL INFO cudaDriverVersion 12010 +gpub090:1179803:1179803 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.190<0> +gpub090:1179803:1179803 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub090:1179803:1179871 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.190<0> +gpub090:1179803:1179871 [0] NCCL INFO Using network IB +gpub090:1179803:1179871 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub090:1179803:1179871 [0] NCCL INFO Trees [0] 121/124/-1->120->112 [1] 121/-1/-1->120->117 +gpub090:1179803:1179871 [0] NCCL INFO Channel 00/0 : 119[c7000] -> 120[7000] [receive] via NET/IB/0 +gpub090:1179803:1179871 [0] NCCL INFO Channel 01/0 : 119[c7000] -> 120[7000] [receive] via NET/IB/0 +gpub090:1179803:1179871 [0] NCCL INFO Channel 00/0 : 120[7000] -> 121[46000] via P2P/IPC +gpub090:1179803:1179871 [0] NCCL INFO Channel 01/0 : 120[7000] -> 121[46000] via P2P/IPC +gpub090:1179803:1179871 [0] NCCL INFO Connected all rings +gpub090:1179803:1179871 [0] NCCL INFO Channel 01/0 : 117[46000] -> 120[7000] [receive] via NET/IB/0 +gpub090:1179803:1179871 [0] NCCL INFO Channel 00/0 : 120[7000] -> 124[7000] [send] via NET/IB/0 +gpub090:1179803:1179871 [0] NCCL INFO Channel 00/0 : 112[7000] -> 120[7000] [receive] via NET/IB/0 +gpub090:1179803:1179871 [0] NCCL INFO Channel 00/0 : 120[7000] -> 112[7000] [send] via NET/IB/0 +gpub090:1179803:1179871 [0] NCCL INFO Channel 00/0 : 124[7000] -> 120[7000] [receive] via NET/IB/0 +gpub090:1179803:1179871 [0] NCCL INFO Channel 01/0 : 120[7000] -> 117[46000] [send] via NET/IB/0 +gpub090:1179803:1179871 [0] NCCL INFO Connected all trees +gpub090:1179803:1179871 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub090:1179803:1179871 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub090:1179803:1179871 [0] NCCL INFO comm 0x521aa9c0 rank 120 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub080:3990802:3990802 [3] NCCL INFO cudaDriverVersion 12010 +gpub080:3990802:3990802 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.180<0> +gpub080:3990802:3990802 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub080:3990802:3990869 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.180<0> +gpub080:3990802:3990869 [3] NCCL INFO Using network IB +gpub080:3990802:3990869 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub080:3990802:3990869 [3] NCCL INFO Trees [0] -1/-1/-1->103->102 [1] -1/-1/-1->103->102 +gpub080:3990802:3990869 [3] NCCL INFO Channel 00/0 : 103[c7000] -> 104[7000] [send] via NET/IB/0 +gpub080:3990802:3990869 [3] NCCL INFO Channel 01/0 : 103[c7000] -> 104[7000] [send] via NET/IB/0 +gpub080:3990802:3990869 [3] NCCL INFO Connected all rings +gpub080:3990802:3990869 [3] NCCL INFO Channel 00/0 : 103[c7000] -> 102[85000] via P2P/IPC +gpub080:3990802:3990869 [3] NCCL INFO Channel 01/0 : 103[c7000] -> 102[85000] via P2P/IPC +gpub080:3990802:3990869 [3] NCCL INFO Connected all trees +gpub080:3990802:3990869 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub080:3990802:3990869 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub080:3990802:3990869 [3] NCCL INFO comm 0xb6ed4500 rank 103 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub090:1179804:1179804 [1] NCCL INFO cudaDriverVersion 12010 +gpub090:1179804:1179804 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.190<0> +gpub090:1179804:1179804 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub090:1179804:1179870 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.190<0> +gpub090:1179804:1179870 [1] NCCL INFO Using network IB +gpub090:1179804:1179870 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub090:1179804:1179870 [1] NCCL INFO Trees [0] 122/116/-1->121->120 [1] 122/-1/-1->121->120 +gpub090:1179804:1179870 [1] NCCL INFO Channel 00/0 : 121[46000] -> 122[85000] via P2P/IPC +gpub090:1179804:1179870 [1] NCCL INFO Channel 01/0 : 121[46000] -> 122[85000] via P2P/IPC +gpub090:1179804:1179870 [1] NCCL INFO Connected all rings +gpub090:1179804:1179870 [1] NCCL INFO Channel 00/0 : 116[7000] -> 121[46000] [receive] via NET/IB/0 +gpub090:1179804:1179870 [1] NCCL INFO Channel 00/0 : 121[46000] -> 116[7000] [send] via NET/IB/0 +gpub090:1179804:1179870 [1] NCCL INFO Channel 00/0 : 121[46000] -> 120[7000] via P2P/IPC +gpub090:1179804:1179870 [1] NCCL INFO Channel 01/0 : 121[46000] -> 120[7000] via P2P/IPC +gpub090:1179804:1179870 [1] NCCL INFO Connected all trees +gpub090:1179804:1179870 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub090:1179804:1179870 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub090:1179804:1179870 [1] NCCL INFO comm 0x50af0d60 rank 121 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub091:1688863:1688863 [1] NCCL INFO cudaDriverVersion 12010 +gpub091:1688863:1688863 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.191<0> +gpub091:1688863:1688863 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub091:1688863:1688931 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.191<0> +gpub091:1688863:1688931 [1] NCCL INFO Using network IB +gpub091:1688863:1688931 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub091:1688863:1688931 [1] NCCL INFO Trees [0] 126/-1/-1->125->124 [1] 126/-1/-1->125->124 +gpub091:1688863:1688931 [1] NCCL INFO Channel 00/0 : 125[46000] -> 126[85000] via P2P/IPC +gpub091:1688863:1688931 [1] NCCL INFO Channel 01/0 : 125[46000] -> 126[85000] via P2P/IPC +gpub091:1688863:1688931 [1] NCCL INFO Connected all rings +gpub091:1688863:1688931 [1] NCCL INFO Channel 00/0 : 125[46000] -> 124[7000] via P2P/IPC +gpub091:1688863:1688931 [1] NCCL INFO Channel 01/0 : 125[46000] -> 124[7000] via P2P/IPC +gpub091:1688863:1688931 [1] NCCL INFO Connected all trees +gpub091:1688863:1688931 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub091:1688863:1688931 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub091:1688863:1688931 [1] NCCL INFO comm 0x50bc8450 rank 125 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub090:1179806:1179806 [3] NCCL INFO cudaDriverVersion 12010 +gpub090:1179806:1179806 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.190<0> +gpub090:1179806:1179806 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub090:1179806:1179869 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.190<0> +gpub090:1179806:1179869 [3] NCCL INFO Using network IB +gpub090:1179806:1179869 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub090:1179806:1179869 [3] NCCL INFO Trees [0] -1/-1/-1->123->122 [1] -1/-1/-1->123->122 +gpub090:1179806:1179869 [3] NCCL INFO Channel 00/0 : 123[c7000] -> 124[7000] [send] via NET/IB/0 +gpub090:1179806:1179869 [3] NCCL INFO Channel 01/0 : 123[c7000] -> 124[7000] [send] via NET/IB/0 +gpub090:1179806:1179869 [3] NCCL INFO Connected all rings +gpub090:1179806:1179869 [3] NCCL INFO Channel 00/0 : 123[c7000] -> 122[85000] via P2P/IPC +gpub090:1179806:1179869 [3] NCCL INFO Channel 01/0 : 123[c7000] -> 122[85000] via P2P/IPC +gpub090:1179806:1179869 [3] NCCL INFO Connected all trees +gpub090:1179806:1179869 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub090:1179806:1179869 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub090:1179806:1179869 [3] NCCL INFO comm 0x5049f1b0 rank 123 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub058:1406057:1406057 [1] NCCL INFO cudaDriverVersion 12010 +gpub058:1406057:1406057 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.158<0> +gpub058:1406057:1406057 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub058:1406057:1406117 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.158<0> +gpub058:1406057:1406117 [1] NCCL INFO Using network IB +gpub058:1406057:1406117 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub058:1406057:1406117 [1] NCCL INFO Trees [0] 66/32/-1->65->64 [1] 66/-1/-1->65->64 +gpub058:1406057:1406117 [1] NCCL INFO Channel 00/0 : 65[46000] -> 66[85000] via P2P/IPC +gpub058:1406057:1406117 [1] NCCL INFO Channel 01/0 : 65[46000] -> 66[85000] via P2P/IPC +gpub058:1406057:1406117 [1] NCCL INFO Connected all rings +gpub058:1406057:1406117 [1] NCCL INFO Channel 00/0 : 32[7000] -> 65[46000] [receive] via NET/IB/0 +gpub058:1406057:1406117 [1] NCCL INFO Channel 00/0 : 65[46000] -> 32[7000] [send] via NET/IB/0 +gpub058:1406057:1406117 [1] NCCL INFO Channel 00/0 : 65[46000] -> 64[7000] via P2P/IPC +gpub058:1406057:1406117 [1] NCCL INFO Channel 01/0 : 65[46000] -> 64[7000] via P2P/IPC +gpub058:1406057:1406117 [1] NCCL INFO Connected all trees +gpub058:1406057:1406117 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub058:1406057:1406117 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub058:1406057:1406117 [1] NCCL INFO comm 0x503d1e40 rank 65 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub091:1688865:1688865 [3] NCCL INFO cudaDriverVersion 12010 +gpub091:1688865:1688865 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.191<0> +gpub091:1688865:1688865 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub091:1688865:1688929 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.191<0> +gpub091:1688865:1688929 [3] NCCL INFO Using network IB +gpub091:1688865:1688929 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub091:1688865:1688929 [3] NCCL INFO Trees [0] -1/-1/-1->127->126 [1] -1/-1/-1->127->126 +gpub091:1688865:1688929 [3] NCCL INFO Channel 00/0 : 127[c7000] -> 0[7000] [send] via NET/IB/0 +gpub091:1688865:1688929 [3] NCCL INFO Channel 01/0 : 127[c7000] -> 0[7000] [send] via NET/IB/0 +gpub091:1688865:1688929 [3] NCCL INFO Connected all rings +gpub091:1688865:1688929 [3] NCCL INFO Channel 00/0 : 127[c7000] -> 126[85000] via P2P/IPC +gpub091:1688865:1688929 [3] NCCL INFO Channel 01/0 : 127[c7000] -> 126[85000] via P2P/IPC +gpub091:1688865:1688929 [3] NCCL INFO Connected all trees +gpub091:1688865:1688929 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub091:1688865:1688929 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub091:1688865:1688929 [3] NCCL INFO comm 0x8ce193d0 rank 127 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub001:279949:279949 [1] NCCL INFO cudaDriverVersion 12010 +gpub001:279949:279949 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.101<0> +gpub001:279949:279949 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub001:279949:280015 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.101<0> +gpub001:279949:280015 [1] NCCL INFO Using network IB +gpub001:279949:280015 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub001:279949:280015 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +gpub001:279949:280015 [1] NCCL INFO Channel 00/0 : 1[46000] -> 2[85000] via P2P/IPC +gpub001:279949:280015 [1] NCCL INFO Channel 01/0 : 1[46000] -> 2[85000] via P2P/IPC +gpub001:279949:280015 [1] NCCL INFO Connected all rings +gpub001:279949:280015 [1] NCCL INFO Channel 00/0 : 1[46000] -> 0[7000] via P2P/IPC +gpub001:279949:280015 [1] NCCL INFO Channel 01/0 : 1[46000] -> 0[7000] via P2P/IPC +gpub001:279949:280015 [1] NCCL INFO Connected all trees +gpub001:279949:280015 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub001:279949:280015 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub001:279949:280015 [1] NCCL INFO comm 0x8b9ccb80 rank 1 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub035:2421441:2421441 [0] NCCL INFO cudaDriverVersion 12010 +gpub035:2421441:2421441 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.135<0> +gpub035:2421441:2421441 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub035:2421441:2421510 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.135<0> +gpub035:2421441:2421510 [0] NCCL INFO Using network IB +gpub035:2421441:2421510 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub035:2421441:2421510 [0] NCCL INFO Trees [0] 41/44/-1->40->49 [1] 41/-1/-1->40->37 +gpub035:2421441:2421510 [0] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [receive] via NET/IB/0 +gpub035:2421441:2421510 [0] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [receive] via NET/IB/0 +gpub035:2421441:2421510 [0] NCCL INFO Channel 00/0 : 40[7000] -> 41[46000] via P2P/IPC +gpub035:2421441:2421510 [0] NCCL INFO Channel 01/0 : 40[7000] -> 41[46000] via P2P/IPC +gpub035:2421441:2421510 [0] NCCL INFO Connected all rings +gpub035:2421441:2421510 [0] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [receive] via NET/IB/0 +gpub035:2421441:2421510 [0] NCCL INFO Channel 00/0 : 40[7000] -> 44[7000] [send] via NET/IB/0 +gpub035:2421441:2421510 [0] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [send] via NET/IB/0 +gpub035:2421441:2421510 [0] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [receive] via NET/IB/0 +gpub035:2421441:2421510 [0] NCCL INFO Channel 00/0 : 44[7000] -> 40[7000] [receive] via NET/IB/0 +gpub035:2421441:2421510 [0] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [send] via NET/IB/0 +gpub035:2421441:2421510 [0] NCCL INFO Connected all trees +gpub035:2421441:2421510 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub035:2421441:2421510 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub035:2421441:2421510 [0] NCCL INFO comm 0x8ca3dc20 rank 40 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub001:279951:279951 [3] NCCL INFO cudaDriverVersion 12010 +gpub001:279951:279951 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.101<0> +gpub001:279951:279951 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub001:279951:280014 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.101<0> +gpub001:279951:280014 [3] NCCL INFO Using network IB +gpub001:279951:280014 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub001:279951:280014 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +gpub001:279951:280014 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 4[7000] [send] via NET/IB/0 +gpub001:279951:280014 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 4[7000] [send] via NET/IB/0 +gpub001:279951:280014 [3] NCCL INFO Connected all rings +gpub001:279951:280014 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 2[85000] via P2P/IPC +gpub001:279951:280014 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 2[85000] via P2P/IPC +gpub001:279951:280014 [3] NCCL INFO Connected all trees +gpub001:279951:280014 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub001:279951:280014 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub001:279951:280014 [3] NCCL INFO comm 0x8d9eaa40 rank 3 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub085:1471917:1471917 [1] NCCL INFO cudaDriverVersion 12010 +gpub085:1471917:1471917 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.185<0> +gpub085:1471917:1471917 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub085:1471917:1471987 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.185<0> +gpub085:1471917:1471987 [1] NCCL INFO Using network IB +gpub085:1471917:1471987 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub085:1471917:1471987 [1] NCCL INFO Trees [0] 110/-1/-1->109->108 [1] 110/116/-1->109->108 +gpub085:1471917:1471987 [1] NCCL INFO Channel 00/0 : 109[46000] -> 110[85000] via P2P/IPC +gpub085:1471917:1471987 [1] NCCL INFO Channel 01/0 : 109[46000] -> 110[85000] via P2P/IPC +gpub085:1471917:1471987 [1] NCCL INFO Connected all rings +gpub085:1471917:1471987 [1] NCCL INFO Channel 01/0 : 109[46000] -> 116[7000] [send] via NET/IB/0 +gpub085:1471917:1471987 [1] NCCL INFO Channel 01/0 : 116[7000] -> 109[46000] [receive] via NET/IB/0 +gpub085:1471917:1471987 [1] NCCL INFO Channel 00/0 : 109[46000] -> 108[7000] via P2P/IPC +gpub085:1471917:1471987 [1] NCCL INFO Channel 01/0 : 109[46000] -> 108[7000] via P2P/IPC +gpub085:1471917:1471987 [1] NCCL INFO Connected all trees +gpub085:1471917:1471987 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub085:1471917:1471987 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub085:1471917:1471987 [1] NCCL INFO comm 0x9704c80 rank 109 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub085:1471918:1471918 [2] NCCL INFO cudaDriverVersion 12010 +gpub085:1471918:1471918 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.185<0> +gpub085:1471918:1471918 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub085:1471918:1471989 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.185<0> +gpub085:1471918:1471989 [2] NCCL INFO Using network IB +gpub085:1471918:1471989 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub085:1471918:1471989 [2] NCCL INFO Trees [0] 111/-1/-1->110->109 [1] 111/-1/-1->110->109 +gpub085:1471918:1471989 [2] NCCL INFO Channel 00/0 : 110[85000] -> 111[c7000] via P2P/IPC +gpub085:1471918:1471989 [2] NCCL INFO Channel 01/0 : 110[85000] -> 111[c7000] via P2P/IPC +gpub085:1471918:1471989 [2] NCCL INFO Connected all rings +gpub085:1471918:1471989 [2] NCCL INFO Channel 00/0 : 110[85000] -> 109[46000] via P2P/IPC +gpub085:1471918:1471989 [2] NCCL INFO Channel 01/0 : 110[85000] -> 109[46000] via P2P/IPC +gpub085:1471918:1471989 [2] NCCL INFO Connected all trees +gpub085:1471918:1471989 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub085:1471918:1471989 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub085:1471918:1471989 [2] NCCL INFO comm 0x4fbd7c40 rank 110 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub085:1471916:1471916 [0] NCCL INFO cudaDriverVersion 12010 +gpub085:1471916:1471916 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.185<0> +gpub085:1471916:1471916 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub085:1471916:1471986 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.185<0> +gpub085:1471916:1471986 [0] NCCL INFO Using network IB +gpub085:1471916:1471986 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub085:1471916:1471986 [0] NCCL INFO Trees [0] 109/-1/-1->108->104 [1] 109/100/-1->108->93 +gpub085:1471916:1471986 [0] NCCL INFO Channel 00/0 : 107[c7000] -> 108[7000] [receive] via NET/IB/0 +gpub085:1471916:1471986 [0] NCCL INFO Channel 01/0 : 107[c7000] -> 108[7000] [receive] via NET/IB/0 +gpub085:1471916:1471986 [0] NCCL INFO Channel 00/0 : 108[7000] -> 109[46000] via P2P/IPC +gpub085:1471916:1471986 [0] NCCL INFO Channel 01/0 : 108[7000] -> 109[46000] via P2P/IPC +gpub085:1471916:1471986 [0] NCCL INFO Connected all rings +gpub085:1471916:1471986 [0] NCCL INFO Channel 00/0 : 104[7000] -> 108[7000] [receive] via NET/IB/0 +gpub085:1471916:1471986 [0] NCCL INFO Channel 01/0 : 100[7000] -> 108[7000] [receive] via NET/IB/0 +gpub085:1471916:1471986 [0] NCCL INFO Channel 01/0 : 93[46000] -> 108[7000] [receive] via NET/IB/0 +gpub085:1471916:1471986 [0] NCCL INFO Channel 01/0 : 108[7000] -> 93[46000] [send] via NET/IB/0 +gpub085:1471916:1471986 [0] NCCL INFO Channel 01/0 : 108[7000] -> 100[7000] [send] via NET/IB/0 +gpub085:1471916:1471986 [0] NCCL INFO Channel 00/0 : 108[7000] -> 104[7000] [send] via NET/IB/0 +gpub085:1471916:1471986 [0] NCCL INFO Connected all trees +gpub085:1471916:1471986 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub085:1471916:1471986 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub085:1471916:1471986 [0] NCCL INFO comm 0xb90085a0 rank 108 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub068:1244812:1244812 [0] NCCL INFO cudaDriverVersion 12010 +gpub068:1244812:1244812 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.168<0> +gpub068:1244812:1244812 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub068:1244812:1244877 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.168<0> +gpub068:1244812:1244877 [0] NCCL INFO Using network IB +gpub068:1244812:1244877 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub068:1244812:1244877 [0] NCCL INFO Trees [0] 93/-1/-1->92->88 [1] 93/76/-1->92->61 +gpub068:1244812:1244877 [0] NCCL INFO Channel 00/0 : 91[c7000] -> 92[7000] [receive] via NET/IB/0 +gpub068:1244812:1244877 [0] NCCL INFO Channel 01/0 : 91[c7000] -> 92[7000] [receive] via NET/IB/0 +gpub068:1244812:1244877 [0] NCCL INFO Channel 00/0 : 92[7000] -> 93[46000] via P2P/IPC +gpub068:1244812:1244877 [0] NCCL INFO Channel 01/0 : 92[7000] -> 93[46000] via P2P/IPC +gpub068:1244812:1244877 [0] NCCL INFO Connected all rings +gpub068:1244812:1244877 [0] NCCL INFO Channel 00/0 : 88[7000] -> 92[7000] [receive] via NET/IB/0 +gpub068:1244812:1244877 [0] NCCL INFO Channel 01/0 : 76[7000] -> 92[7000] [receive] via NET/IB/0 +gpub068:1244812:1244877 [0] NCCL INFO Channel 01/0 : 61[46000] -> 92[7000] [receive] via NET/IB/0 +gpub068:1244812:1244877 [0] NCCL INFO Channel 01/0 : 92[7000] -> 61[46000] [send] via NET/IB/0 +gpub068:1244812:1244877 [0] NCCL INFO Channel 01/0 : 92[7000] -> 76[7000] [send] via NET/IB/0 +gpub068:1244812:1244877 [0] NCCL INFO Channel 00/0 : 92[7000] -> 88[7000] [send] via NET/IB/0 +gpub068:1244812:1244877 [0] NCCL INFO Connected all trees +gpub068:1244812:1244877 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub068:1244812:1244877 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub068:1244812:1244877 [0] NCCL INFO comm 0x4fce69e0 rank 92 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub068:1244813:1244813 [1] NCCL INFO cudaDriverVersion 12010 +gpub068:1244813:1244813 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.168<0> +gpub068:1244813:1244813 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub068:1244813:1244876 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.168<0> +gpub068:1244813:1244876 [1] NCCL INFO Using network IB +gpub068:1244813:1244876 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub068:1244813:1244876 [1] NCCL INFO Trees [0] 94/-1/-1->93->92 [1] 94/108/-1->93->92 +gpub068:1244813:1244876 [1] NCCL INFO Channel 00/0 : 93[46000] -> 94[85000] via P2P/IPC +gpub068:1244813:1244876 [1] NCCL INFO Channel 01/0 : 93[46000] -> 94[85000] via P2P/IPC +gpub068:1244813:1244876 [1] NCCL INFO Connected all rings +gpub068:1244813:1244876 [1] NCCL INFO Channel 01/0 : 93[46000] -> 108[7000] [send] via NET/IB/0 +gpub068:1244813:1244876 [1] NCCL INFO Channel 01/0 : 108[7000] -> 93[46000] [receive] via NET/IB/0 +gpub068:1244813:1244876 [1] NCCL INFO Channel 00/0 : 93[46000] -> 92[7000] via P2P/IPC +gpub068:1244813:1244876 [1] NCCL INFO Channel 01/0 : 93[46000] -> 92[7000] via P2P/IPC +gpub068:1244813:1244876 [1] NCCL INFO Connected all trees +gpub068:1244813:1244876 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub068:1244813:1244876 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub068:1244813:1244876 [1] NCCL INFO comm 0x510f96c0 rank 93 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub015:699660:699660 [3] NCCL INFO cudaDriverVersion 12010 +gpub015:699660:699660 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.115<0> +gpub015:699660:699660 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub015:699660:699721 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.115<0> +gpub015:699660:699721 [3] NCCL INFO Using network IB +gpub015:699660:699721 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub015:699660:699721 [3] NCCL INFO Trees [0] -1/-1/-1->27->26 [1] -1/-1/-1->27->26 +gpub015:699660:699721 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [send] via NET/IB/0 +gpub015:699660:699721 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [send] via NET/IB/0 +gpub015:699660:699721 [3] NCCL INFO Connected all rings +gpub015:699660:699721 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 26[85000] via P2P/IPC +gpub015:699660:699721 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 26[85000] via P2P/IPC +gpub015:699660:699721 [3] NCCL INFO Connected all trees +gpub015:699660:699721 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub015:699660:699721 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub015:699660:699721 [3] NCCL INFO comm 0x4f795830 rank 27 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub085:1471919:1471919 [3] NCCL INFO cudaDriverVersion 12010 +gpub085:1471919:1471919 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.185<0> +gpub085:1471919:1471919 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub085:1471919:1471988 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.185<0> +gpub085:1471919:1471988 [3] NCCL INFO Using network IB +gpub085:1471919:1471988 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub085:1471919:1471988 [3] NCCL INFO Trees [0] -1/-1/-1->111->110 [1] -1/-1/-1->111->110 +gpub085:1471919:1471988 [3] NCCL INFO Channel 00/0 : 111[c7000] -> 112[7000] [send] via NET/IB/0 +gpub085:1471919:1471988 [3] NCCL INFO Channel 01/0 : 111[c7000] -> 112[7000] [send] via NET/IB/0 +gpub085:1471919:1471988 [3] NCCL INFO Connected all rings +gpub085:1471919:1471988 [3] NCCL INFO Channel 00/0 : 111[c7000] -> 110[85000] via P2P/IPC +gpub085:1471919:1471988 [3] NCCL INFO Channel 01/0 : 111[c7000] -> 110[85000] via P2P/IPC +gpub085:1471919:1471988 [3] NCCL INFO Connected all trees +gpub085:1471919:1471988 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub085:1471919:1471988 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub085:1471919:1471988 [3] NCCL INFO comm 0xb79c1300 rank 111 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub041:1218042:1218042 [3] NCCL INFO cudaDriverVersion 12010 +gpub041:1218042:1218042 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.141<0> +gpub041:1218042:1218042 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub041:1218042:1218107 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.141<0> +gpub041:1218042:1218107 [3] NCCL INFO Using network IB +gpub041:1218042:1218107 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub041:1218042:1218107 [3] NCCL INFO Trees [0] -1/-1/-1->63->62 [1] -1/-1/-1->63->62 +gpub041:1218042:1218107 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 64[7000] [send] via NET/IB/0 +gpub041:1218042:1218107 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 64[7000] [send] via NET/IB/0 +gpub041:1218042:1218107 [3] NCCL INFO Connected all rings +gpub041:1218042:1218107 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 62[85000] via P2P/IPC +gpub041:1218042:1218107 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 62[85000] via P2P/IPC +gpub041:1218042:1218107 [3] NCCL INFO Connected all trees +gpub041:1218042:1218107 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub041:1218042:1218107 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub041:1218042:1218107 [3] NCCL INFO comm 0x94448880 rank 63 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub058:1406056:1406056 [0] NCCL INFO cudaDriverVersion 12010 +gpub058:1406056:1406056 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.158<0> +gpub058:1406056:1406056 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub058:1406056:1406118 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.158<0> +gpub058:1406056:1406118 [0] NCCL INFO Using network IB +gpub058:1406056:1406118 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub058:1406056:1406118 [0] NCCL INFO Trees [0] 65/96/-1->64->0 [1] 65/-1/-1->64->68 +gpub058:1406056:1406118 [0] NCCL INFO Channel 00/0 : 63[c7000] -> 64[7000] [receive] via NET/IB/0 +gpub058:1406056:1406118 [0] NCCL INFO Channel 01/0 : 63[c7000] -> 64[7000] [receive] via NET/IB/0 +gpub058:1406056:1406118 [0] NCCL INFO Channel 00/0 : 64[7000] -> 65[46000] via P2P/IPC +gpub058:1406056:1406118 [0] NCCL INFO Channel 01/0 : 64[7000] -> 65[46000] via P2P/IPC +gpub058:1406056:1406118 [0] NCCL INFO Connected all rings +gpub058:1406056:1406118 [0] NCCL INFO Channel 01/0 : 64[7000] -> 68[7000] [send] via NET/IB/0 +gpub058:1406056:1406118 [0] NCCL INFO Channel 00/0 : 64[7000] -> 96[7000] [send] via NET/IB/0 +gpub058:1406056:1406118 [0] NCCL INFO Channel 00/0 : 0[7000] -> 64[7000] [receive] via NET/IB/0 +gpub058:1406056:1406118 [0] NCCL INFO Channel 00/0 : 64[7000] -> 0[7000] [send] via NET/IB/0 +gpub058:1406056:1406118 [0] NCCL INFO Channel 00/0 : 96[7000] -> 64[7000] [receive] via NET/IB/0 +gpub058:1406056:1406118 [0] NCCL INFO Channel 01/0 : 68[7000] -> 64[7000] [receive] via NET/IB/0 +gpub058:1406056:1406118 [0] NCCL INFO Connected all trees +gpub058:1406056:1406118 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub058:1406056:1406118 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub058:1406056:1406118 [0] NCCL INFO comm 0x9044470 rank 64 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub041:1218040:1218040 [1] NCCL INFO cudaDriverVersion 12010 +gpub041:1218040:1218040 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.141<0> +gpub041:1218040:1218040 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub041:1218040:1218108 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.141<0> +gpub041:1218040:1218108 [1] NCCL INFO Using network IB +gpub041:1218040:1218108 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub041:1218040:1218108 [1] NCCL INFO Trees [0] 62/-1/-1->61->60 [1] 62/92/-1->61->60 +gpub041:1218040:1218108 [1] NCCL INFO Channel 00/0 : 61[46000] -> 62[85000] via P2P/IPC +gpub041:1218040:1218108 [1] NCCL INFO Channel 01/0 : 61[46000] -> 62[85000] via P2P/IPC +gpub041:1218040:1218108 [1] NCCL INFO Connected all rings +gpub041:1218040:1218108 [1] NCCL INFO Channel 01/0 : 61[46000] -> 92[7000] [send] via NET/IB/0 +gpub041:1218040:1218108 [1] NCCL INFO Channel 01/0 : 92[7000] -> 61[46000] [receive] via NET/IB/0 +gpub015:699658:699658 [1] NCCL INFO cudaDriverVersion 12010 +gpub015:699658:699658 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.115<0> +gpub015:699658:699658 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub015:699658:699719 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.115<0> +gpub015:699658:699719 [1] NCCL INFO Using network IB +gpub015:699658:699719 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub015:699658:699719 [1] NCCL INFO Trees [0] 26/20/-1->25->24 [1] 26/-1/-1->25->24 +gpub015:699658:699719 [1] NCCL INFO Channel 00/0 : 25[46000] -> 26[85000] via P2P/IPC +gpub015:699658:699719 [1] NCCL INFO Channel 01/0 : 25[46000] -> 26[85000] via P2P/IPC +gpub015:699658:699719 [1] NCCL INFO Connected all rings +gpub015:699658:699719 [1] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [receive] via NET/IB/0 +gpub015:699658:699719 [1] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [send] via NET/IB/0 +gpub015:699658:699719 [1] NCCL INFO Channel 00/0 : 25[46000] -> 24[7000] via P2P/IPC +gpub015:699658:699719 [1] NCCL INFO Channel 01/0 : 25[46000] -> 24[7000] via P2P/IPC +gpub015:699658:699719 [1] NCCL INFO Connected all trees +gpub015:699658:699719 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub015:699658:699719 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub015:699658:699719 [1] NCCL INFO comm 0x4ff5faa0 rank 25 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub041:1218040:1218108 [1] NCCL INFO Channel 00/0 : 61[46000] -> 60[7000] via P2P/IPC +gpub041:1218040:1218108 [1] NCCL INFO Channel 01/0 : 61[46000] -> 60[7000] via P2P/IPC +gpub041:1218040:1218108 [1] NCCL INFO Connected all trees +gpub041:1218040:1218108 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub041:1218040:1218108 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub041:1218040:1218108 [1] NCCL INFO comm 0x8e9b8f0 rank 61 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub015:699657:699657 [0] NCCL INFO cudaDriverVersion 12010 +gpub015:699657:699657 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.115<0> +gpub015:699657:699657 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub015:699657:699718 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.115<0> +gpub015:699657:699718 [0] NCCL INFO Using network IB +gpub015:699657:699718 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub015:699657:699718 [0] NCCL INFO Trees [0] 25/28/-1->24->16 [1] 25/-1/-1->24->21 +gpub015:699657:699718 [0] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [receive] via NET/IB/0 +gpub015:699657:699718 [0] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [receive] via NET/IB/0 +gpub015:699657:699718 [0] NCCL INFO Channel 00/0 : 24[7000] -> 25[46000] via P2P/IPC +gpub015:699657:699718 [0] NCCL INFO Channel 01/0 : 24[7000] -> 25[46000] via P2P/IPC +gpub015:699657:699718 [0] NCCL INFO Connected all rings +gpub015:699657:699718 [0] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [receive] via NET/IB/0 +gpub015:699657:699718 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [send] via NET/IB/0 +gpub015:699657:699718 [0] NCCL INFO Channel 00/0 : 16[7000] -> 24[7000] [receive] via NET/IB/0 +gpub015:699657:699718 [0] NCCL INFO Channel 00/0 : 24[7000] -> 16[7000] [send] via NET/IB/0 +gpub015:699657:699718 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [receive] via NET/IB/0 +gpub015:699657:699718 [0] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [send] via NET/IB/0 +gpub015:699657:699718 [0] NCCL INFO Connected all trees +gpub015:699657:699718 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub015:699657:699718 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub015:699657:699718 [0] NCCL INFO comm 0x50bba4a0 rank 24 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub080:3990799:3990799 [0] NCCL INFO cudaDriverVersion 12010 +gpub080:3990799:3990799 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.180<0> +gpub080:3990799:3990799 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub080:3990799:3990868 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.180<0> +gpub080:3990799:3990868 [0] NCCL INFO Using network IB +gpub080:3990799:3990868 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub080:3990799:3990868 [0] NCCL INFO Trees [0] 101/-1/-1->100->105 [1] 101/96/-1->100->108 +gpub080:3990799:3990868 [0] NCCL INFO Channel 00/0 : 99[c7000] -> 100[7000] [receive] via NET/IB/0 +gpub080:3990799:3990868 [0] NCCL INFO Channel 01/0 : 99[c7000] -> 100[7000] [receive] via NET/IB/0 +gpub080:3990799:3990868 [0] NCCL INFO Channel 00/0 : 100[7000] -> 101[46000] via P2P/IPC +gpub080:3990799:3990868 [0] NCCL INFO Channel 01/0 : 100[7000] -> 101[46000] via P2P/IPC +gpub080:3990799:3990868 [0] NCCL INFO Connected all rings +gpub080:3990799:3990868 [0] NCCL INFO Channel 01/0 : 96[7000] -> 100[7000] [receive] via NET/IB/0 +gpub080:3990799:3990868 [0] NCCL INFO Channel 00/0 : 100[7000] -> 105[46000] [send] via NET/IB/0 +gpub080:3990799:3990868 [0] NCCL INFO Channel 01/0 : 100[7000] -> 108[7000] [send] via NET/IB/0 +gpub080:3990799:3990868 [0] NCCL INFO Channel 01/0 : 108[7000] -> 100[7000] [receive] via NET/IB/0 +gpub080:3990799:3990868 [0] NCCL INFO Channel 00/0 : 105[46000] -> 100[7000] [receive] via NET/IB/0 +gpub080:3990799:3990868 [0] NCCL INFO Channel 01/0 : 100[7000] -> 96[7000] [send] via NET/IB/0 +gpub080:3990799:3990868 [0] NCCL INFO Connected all trees +gpub080:3990799:3990868 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub080:3990799:3990868 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub080:3990799:3990868 [0] NCCL INFO comm 0x8be5af20 rank 100 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub015:699659:699659 [2] NCCL INFO cudaDriverVersion 12010 +gpub015:699659:699659 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.115<0> +gpub015:699659:699659 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub015:699659:699720 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.115<0> +gpub015:699659:699720 [2] NCCL INFO Using network IB +gpub015:699659:699720 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub015:699659:699720 [2] NCCL INFO Trees [0] 27/-1/-1->26->25 [1] 27/-1/-1->26->25 +gpub015:699659:699720 [2] NCCL INFO Channel 00/0 : 26[85000] -> 27[c7000] via P2P/IPC +gpub015:699659:699720 [2] NCCL INFO Channel 01/0 : 26[85000] -> 27[c7000] via P2P/IPC +gpub015:699659:699720 [2] NCCL INFO Connected all rings +gpub015:699659:699720 [2] NCCL INFO Channel 00/0 : 26[85000] -> 25[46000] via P2P/IPC +gpub015:699659:699720 [2] NCCL INFO Channel 01/0 : 26[85000] -> 25[46000] via P2P/IPC +gpub015:699659:699720 [2] NCCL INFO Connected all trees +gpub015:699659:699720 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub015:699659:699720 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub015:699659:699720 [2] NCCL INFO comm 0x504aba10 rank 26 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub080:3990800:3990800 [1] NCCL INFO cudaDriverVersion 12010 +gpub080:3990800:3990800 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.180<0> +gpub080:3990800:3990800 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub080:3990800:3990871 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.180<0> +gpub080:3990800:3990871 [1] NCCL INFO Using network IB +gpub080:3990800:3990871 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub080:3990800:3990871 [1] NCCL INFO Trees [0] 102/-1/-1->101->100 [1] 102/104/-1->101->100 +gpub080:3990800:3990871 [1] NCCL INFO Channel 00/0 : 101[46000] -> 102[85000] via P2P/IPC +gpub080:3990800:3990871 [1] NCCL INFO Channel 01/0 : 101[46000] -> 102[85000] via P2P/IPC +gpub080:3990800:3990871 [1] NCCL INFO Connected all rings +gpub080:3990800:3990871 [1] NCCL INFO Channel 01/0 : 101[46000] -> 104[7000] [send] via NET/IB/0 +gpub080:3990800:3990871 [1] NCCL INFO Channel 01/0 : 104[7000] -> 101[46000] [receive] via NET/IB/0 +gpub080:3990800:3990871 [1] NCCL INFO Channel 00/0 : 101[46000] -> 100[7000] via P2P/IPC +gpub080:3990800:3990871 [1] NCCL INFO Channel 01/0 : 101[46000] -> 100[7000] via P2P/IPC +gpub080:3990800:3990871 [1] NCCL INFO Connected all trees +gpub080:3990800:3990871 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub080:3990800:3990871 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub080:3990800:3990871 [1] NCCL INFO comm 0xb59713a0 rank 101 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub080:3990801:3990801 [2] NCCL INFO cudaDriverVersion 12010 +gpub080:3990801:3990801 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.180<0> +gpub080:3990801:3990801 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub080:3990801:3990870 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.180<0> +gpub080:3990801:3990870 [2] NCCL INFO Using network IB +gpub080:3990801:3990870 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub080:3990801:3990870 [2] NCCL INFO Trees [0] 103/-1/-1->102->101 [1] 103/-1/-1->102->101 +gpub080:3990801:3990870 [2] NCCL INFO Channel 00/0 : 102[85000] -> 103[c7000] via P2P/IPC +gpub080:3990801:3990870 [2] NCCL INFO Channel 01/0 : 102[85000] -> 103[c7000] via P2P/IPC +gpub080:3990801:3990870 [2] NCCL INFO Connected all rings +gpub080:3990801:3990870 [2] NCCL INFO Channel 00/0 : 102[85000] -> 101[46000] via P2P/IPC +gpub080:3990801:3990870 [2] NCCL INFO Channel 01/0 : 102[85000] -> 101[46000] via P2P/IPC +gpub080:3990801:3990870 [2] NCCL INFO Connected all trees +gpub080:3990801:3990870 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub080:3990801:3990870 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub080:3990801:3990870 [2] NCCL INFO comm 0x52129500 rank 102 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub041:1218039:1218039 [0] NCCL INFO cudaDriverVersion 12010 +gpub041:1218039:1218039 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.141<0> +gpub041:1218039:1218039 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub041:1218039:1218106 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.141<0> +gpub041:1218039:1218106 [0] NCCL INFO Using network IB +gpub041:1218039:1218106 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub041:1218039:1218106 [0] NCCL INFO Trees [0] 61/-1/-1->60->56 [1] 61/28/-1->60->124 +gpub041:1218039:1218106 [0] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [receive] via NET/IB/0 +gpub041:1218039:1218106 [0] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [receive] via NET/IB/0 +gpub041:1218039:1218106 [0] NCCL INFO Channel 00/0 : 60[7000] -> 61[46000] via P2P/IPC +gpub041:1218039:1218106 [0] NCCL INFO Channel 01/0 : 60[7000] -> 61[46000] via P2P/IPC +gpub041:1218039:1218106 [0] NCCL INFO Connected all rings +gpub041:1218039:1218106 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [receive] via NET/IB/0 +gpub041:1218039:1218106 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [receive] via NET/IB/0 +gpub041:1218039:1218106 [0] NCCL INFO Channel 01/0 : 124[7000] -> 60[7000] [receive] via NET/IB/0 +gpub041:1218039:1218106 [0] NCCL INFO Channel 01/0 : 60[7000] -> 124[7000] [send] via NET/IB/0 +gpub041:1218039:1218106 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [send] via NET/IB/0 +gpub041:1218039:1218106 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [send] via NET/IB/0 +gpub041:1218039:1218106 [0] NCCL INFO Connected all trees +gpub041:1218039:1218106 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub041:1218039:1218106 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub041:1218039:1218106 [0] NCCL INFO comm 0x520cee30 rank 60 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub041:1218041:1218041 [2] NCCL INFO cudaDriverVersion 12010 +gpub041:1218041:1218041 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.141<0> +gpub041:1218041:1218041 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub041:1218041:1218109 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.141<0> +gpub041:1218041:1218109 [2] NCCL INFO Using network IB +gpub041:1218041:1218109 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub041:1218041:1218109 [2] NCCL INFO Trees [0] 63/-1/-1->62->61 [1] 63/-1/-1->62->61 +gpub041:1218041:1218109 [2] NCCL INFO Channel 00/0 : 62[85000] -> 63[c7000] via P2P/IPC +gpub041:1218041:1218109 [2] NCCL INFO Channel 01/0 : 62[85000] -> 63[c7000] via P2P/IPC +gpub041:1218041:1218109 [2] NCCL INFO Connected all rings +gpub041:1218041:1218109 [2] NCCL INFO Channel 00/0 : 62[85000] -> 61[46000] via P2P/IPC +gpub041:1218041:1218109 [2] NCCL INFO Channel 01/0 : 62[85000] -> 61[46000] via P2P/IPC +gpub041:1218041:1218109 [2] NCCL INFO Connected all trees +gpub041:1218041:1218109 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub041:1218041:1218109 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub041:1218041:1218109 [2] NCCL INFO comm 0x8dfe3560 rank 62 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub035:2421442:2421442 [1] NCCL INFO cudaDriverVersion 12010 +gpub035:2421442:2421442 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.135<0> +gpub035:2421442:2421442 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub035:2421442:2421509 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.135<0> +gpub035:2421442:2421509 [1] NCCL INFO Using network IB +gpub035:2421442:2421509 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub035:2421442:2421509 [1] NCCL INFO Trees [0] 42/36/-1->41->40 [1] 42/-1/-1->41->40 +gpub035:2421442:2421509 [1] NCCL INFO Channel 00/0 : 41[46000] -> 42[85000] via P2P/IPC +gpub035:2421442:2421509 [1] NCCL INFO Channel 01/0 : 41[46000] -> 42[85000] via P2P/IPC +gpub035:2421442:2421509 [1] NCCL INFO Connected all rings +gpub035:2421442:2421509 [1] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [receive] via NET/IB/0 +gpub035:2421442:2421509 [1] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [send] via NET/IB/0 +gpub035:2421442:2421509 [1] NCCL INFO Channel 00/0 : 41[46000] -> 40[7000] via P2P/IPC +gpub035:2421442:2421509 [1] NCCL INFO Channel 01/0 : 41[46000] -> 40[7000] via P2P/IPC +gpub035:2421442:2421509 [1] NCCL INFO Connected all trees +gpub035:2421442:2421509 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub035:2421442:2421509 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub035:2421442:2421509 [1] NCCL INFO comm 0x9bdcb70 rank 41 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub035:2421444:2421444 [3] NCCL INFO cudaDriverVersion 12010 +gpub035:2421444:2421444 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.135<0> +gpub035:2421444:2421444 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub035:2421444:2421508 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.135<0> +gpub035:2421444:2421508 [3] NCCL INFO Using network IB +gpub035:2421444:2421508 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub035:2421444:2421508 [3] NCCL INFO Trees [0] -1/-1/-1->43->42 [1] -1/-1/-1->43->42 +gpub035:2421444:2421508 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 44[7000] [send] via NET/IB/0 +gpub035:2421444:2421508 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 44[7000] [send] via NET/IB/0 +gpub035:2421444:2421508 [3] NCCL INFO Connected all rings +gpub035:2421444:2421508 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 42[85000] via P2P/IPC +gpub035:2421444:2421508 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 42[85000] via P2P/IPC +gpub035:2421444:2421508 [3] NCCL INFO Connected all trees +gpub035:2421444:2421508 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub035:2421444:2421508 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub035:2421444:2421508 [3] NCCL INFO comm 0x93391d0 rank 43 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub035:2421443:2421443 [2] NCCL INFO cudaDriverVersion 12010 +gpub035:2421443:2421443 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.135<0> +gpub035:2421443:2421443 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub035:2421443:2421507 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.135<0> +gpub035:2421443:2421507 [2] NCCL INFO Using network IB +gpub035:2421443:2421507 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub035:2421443:2421507 [2] NCCL INFO Trees [0] 43/-1/-1->42->41 [1] 43/-1/-1->42->41 +gpub035:2421443:2421507 [2] NCCL INFO Channel 00/0 : 42[85000] -> 43[c7000] via P2P/IPC +gpub035:2421443:2421507 [2] NCCL INFO Channel 01/0 : 42[85000] -> 43[c7000] via P2P/IPC +gpub035:2421443:2421507 [2] NCCL INFO Connected all rings +gpub035:2421443:2421507 [2] NCCL INFO Channel 00/0 : 42[85000] -> 41[46000] via P2P/IPC +gpub035:2421443:2421507 [2] NCCL INFO Channel 01/0 : 42[85000] -> 41[46000] via P2P/IPC +gpub035:2421443:2421507 [2] NCCL INFO Connected all trees +gpub035:2421443:2421507 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub035:2421443:2421507 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub035:2421443:2421507 [2] NCCL INFO comm 0x91022d0 rank 42 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub016:1262187:1262187 [0] NCCL INFO cudaDriverVersion 12010 +gpub016:1262187:1262187 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.116<0> +gpub016:1262187:1262187 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub016:1262187:1262261 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.116<0> +gpub016:1262187:1262261 [0] NCCL INFO Using network IB +gpub016:1262187:1262261 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub016:1262187:1262261 [0] NCCL INFO Trees [0] 29/-1/-1->28->24 [1] 29/12/-1->28->60 +gpub016:1262187:1262261 [0] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [receive] via NET/IB/0 +gpub016:1262187:1262261 [0] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [receive] via NET/IB/0 +gpub016:1262187:1262261 [0] NCCL INFO Channel 00/0 : 28[7000] -> 29[46000] via P2P/IPC +gpub016:1262187:1262261 [0] NCCL INFO Channel 01/0 : 28[7000] -> 29[46000] via P2P/IPC +gpub016:1262187:1262261 [0] NCCL INFO Connected all rings +gpub016:1262187:1262261 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [receive] via NET/IB/0 +gpub016:1262187:1262261 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [receive] via NET/IB/0 +gpub016:1262187:1262261 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [send] via NET/IB/0 +gpub016:1262187:1262261 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [receive] via NET/IB/0 +gpub016:1262187:1262261 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [send] via NET/IB/0 +gpub016:1262187:1262261 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [send] via NET/IB/0 +gpub016:1262187:1262261 [0] NCCL INFO Connected all trees +gpub016:1262187:1262261 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub016:1262187:1262261 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub016:1262187:1262261 [0] NCCL INFO comm 0x2218f980 rank 28 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub016:1262190:1262190 [3] NCCL INFO cudaDriverVersion 12010 +gpub016:1262190:1262190 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.116<0> +gpub016:1262190:1262190 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub016:1262190:1262260 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.116<0> +gpub016:1262190:1262260 [3] NCCL INFO Using network IB +gpub016:1262190:1262260 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub016:1262190:1262260 [3] NCCL INFO Trees [0] -1/-1/-1->31->30 [1] -1/-1/-1->31->30 +gpub016:1262190:1262260 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [send] via NET/IB/0 +gpub016:1262190:1262260 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [send] via NET/IB/0 +gpub016:1262190:1262260 [3] NCCL INFO Connected all rings +gpub016:1262190:1262260 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 30[85000] via P2P/IPC +gpub016:1262190:1262260 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 30[85000] via P2P/IPC +gpub016:1262190:1262260 [3] NCCL INFO Connected all trees +gpub016:1262190:1262260 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub016:1262190:1262260 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub016:1262190:1262260 [3] NCCL INFO comm 0xa7c96f60 rank 31 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub031:1764393:1764393 [0] NCCL INFO cudaDriverVersion 12010 +gpub031:1764393:1764393 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0> +gpub031:1764393:1764393 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub031:1764393:1764461 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0> +gpub031:1764393:1764461 [0] NCCL INFO Using network IB +gpub031:1764393:1764461 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub031:1764393:1764461 [0] NCCL INFO Trees [0] 33/48/-1->32->65 [1] 33/-1/-1->32->36 +gpub031:1764393:1764461 [0] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [receive] via NET/IB/0 +gpub031:1764393:1764461 [0] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [receive] via NET/IB/0 +gpub031:1764393:1764461 [0] NCCL INFO Channel 00/0 : 32[7000] -> 33[46000] via P2P/IPC +gpub031:1764393:1764461 [0] NCCL INFO Channel 01/0 : 32[7000] -> 33[46000] via P2P/IPC +gpub031:1764393:1764461 [0] NCCL INFO Connected all rings +gpub031:1764393:1764461 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [send] via NET/IB/0 +gpub031:1764393:1764461 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [send] via NET/IB/0 +gpub031:1764393:1764461 [0] NCCL INFO Channel 00/0 : 32[7000] -> 65[46000] [send] via NET/IB/0 +gpub031:1764393:1764461 [0] NCCL INFO Channel 00/0 : 65[46000] -> 32[7000] [receive] via NET/IB/0 +gpub031:1764393:1764461 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [receive] via NET/IB/0 +gpub031:1764393:1764461 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [receive] via NET/IB/0 +gpub031:1764393:1764461 [0] NCCL INFO Connected all trees +gpub031:1764393:1764461 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub031:1764393:1764461 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub031:1764393:1764461 [0] NCCL INFO comm 0xb6eeceb0 rank 32 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub016:1262189:1262189 [2] NCCL INFO cudaDriverVersion 12010 +gpub016:1262189:1262189 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.116<0> +gpub016:1262189:1262189 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub016:1262189:1262263 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.116<0> +gpub016:1262189:1262263 [2] NCCL INFO Using network IB +gpub016:1262189:1262263 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub016:1262189:1262263 [2] NCCL INFO Trees [0] 31/-1/-1->30->29 [1] 31/-1/-1->30->29 +gpub016:1262189:1262263 [2] NCCL INFO Channel 00/0 : 30[85000] -> 31[c7000] via P2P/IPC +gpub016:1262189:1262263 [2] NCCL INFO Channel 01/0 : 30[85000] -> 31[c7000] via P2P/IPC +gpub016:1262189:1262263 [2] NCCL INFO Connected all rings +gpub016:1262189:1262263 [2] NCCL INFO Channel 00/0 : 30[85000] -> 29[46000] via P2P/IPC +gpub016:1262189:1262263 [2] NCCL INFO Channel 01/0 : 30[85000] -> 29[46000] via P2P/IPC +gpub016:1262189:1262263 [2] NCCL INFO Connected all trees +gpub016:1262189:1262263 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub016:1262189:1262263 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub016:1262189:1262263 [2] NCCL INFO comm 0x503afdd0 rank 30 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub011:1300883:1300883 [3] NCCL INFO cudaDriverVersion 12010 +gpub011:1300883:1300883 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.111<0> +gpub011:1300883:1300883 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub011:1300883:1300950 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.111<0> +gpub011:1300883:1300950 [3] NCCL INFO Using network IB +gpub011:1300883:1300950 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub011:1300883:1300950 [3] NCCL INFO Trees [0] -1/-1/-1->11->10 [1] -1/-1/-1->11->10 +gpub011:1300883:1300950 [3] NCCL INFO Channel 00/0 : 11[c7000] -> 12[7000] [send] via NET/IB/0 +gpub011:1300883:1300950 [3] NCCL INFO Channel 01/0 : 11[c7000] -> 12[7000] [send] via NET/IB/0 +gpub011:1300883:1300950 [3] NCCL INFO Connected all rings +gpub011:1300883:1300950 [3] NCCL INFO Channel 00/0 : 11[c7000] -> 10[85000] via P2P/IPC +gpub011:1300883:1300950 [3] NCCL INFO Channel 01/0 : 11[c7000] -> 10[85000] via P2P/IPC +gpub031:1764395:1764395 [2] NCCL INFO cudaDriverVersion 12010 +gpub031:1764395:1764395 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0> +gpub031:1764395:1764395 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub031:1764395:1764462 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0> +gpub031:1764395:1764462 [2] NCCL INFO Using network IB +gpub031:1764395:1764462 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub031:1764395:1764462 [2] NCCL INFO Trees [0] 35/-1/-1->34->33 [1] 35/-1/-1->34->33 +gpub031:1764395:1764462 [2] NCCL INFO Channel 00/0 : 34[85000] -> 35[c7000] via P2P/IPC +gpub031:1764395:1764462 [2] NCCL INFO Channel 01/0 : 34[85000] -> 35[c7000] via P2P/IPC +gpub031:1764395:1764462 [2] NCCL INFO Connected all rings +gpub031:1764395:1764462 [2] NCCL INFO Channel 00/0 : 34[85000] -> 33[46000] via P2P/IPC +gpub031:1764395:1764462 [2] NCCL INFO Channel 01/0 : 34[85000] -> 33[46000] via P2P/IPC +gpub011:1300883:1300950 [3] NCCL INFO Connected all trees +gpub011:1300883:1300950 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub011:1300883:1300950 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub011:1300883:1300950 [3] NCCL INFO comm 0xb778d540 rank 11 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub031:1764395:1764462 [2] NCCL INFO Connected all trees +gpub031:1764395:1764462 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub031:1764395:1764462 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub031:1764395:1764462 [2] NCCL INFO comm 0x8d6e3c20 rank 34 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub011:1300880:1300880 [0] NCCL INFO cudaDriverVersion 12010 +gpub011:1300880:1300880 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.111<0> +gpub011:1300880:1300880 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub011:1300880:1300948 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.111<0> +gpub011:1300880:1300948 [0] NCCL INFO Using network IB +gpub011:1300880:1300948 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub011:1300880:1300948 [0] NCCL INFO Trees [0] 9/12/-1->8->17 [1] 9/-1/-1->8->5 +gpub011:1300880:1300948 [0] NCCL INFO Channel 00/0 : 7[c7000] -> 8[7000] [receive] via NET/IB/0 +gpub011:1300880:1300948 [0] NCCL INFO Channel 01/0 : 7[c7000] -> 8[7000] [receive] via NET/IB/0 +gpub011:1300880:1300948 [0] NCCL INFO Channel 00/0 : 8[7000] -> 9[46000] via P2P/IPC +gpub011:1300880:1300948 [0] NCCL INFO Channel 01/0 : 8[7000] -> 9[46000] via P2P/IPC +gpub011:1300880:1300948 [0] NCCL INFO Connected all rings +gpub011:1300880:1300948 [0] NCCL INFO Channel 01/0 : 5[46000] -> 8[7000] [receive] via NET/IB/0 +gpub011:1300880:1300948 [0] NCCL INFO Channel 00/0 : 8[7000] -> 12[7000] [send] via NET/IB/0 +gpub011:1300880:1300948 [0] NCCL INFO Channel 00/0 : 8[7000] -> 17[46000] [send] via NET/IB/0 +gpub011:1300880:1300948 [0] NCCL INFO Channel 00/0 : 17[46000] -> 8[7000] [receive] via NET/IB/0 +gpub011:1300880:1300948 [0] NCCL INFO Channel 00/0 : 12[7000] -> 8[7000] [receive] via NET/IB/0 +gpub011:1300880:1300948 [0] NCCL INFO Channel 01/0 : 8[7000] -> 5[46000] [send] via NET/IB/0 +gpub011:1300880:1300948 [0] NCCL INFO Connected all trees +gpub011:1300880:1300948 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub011:1300880:1300948 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub011:1300880:1300948 [0] NCCL INFO comm 0x50df64a0 rank 8 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub031:1764394:1764394 [1] NCCL INFO cudaDriverVersion 12010 +gpub031:1764394:1764394 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0> +gpub031:1764394:1764394 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub031:1764394:1764464 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0> +gpub031:1764394:1764464 [1] NCCL INFO Using network IB +gpub031:1764394:1764464 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub031:1764394:1764464 [1] NCCL INFO Trees [0] 34/16/-1->33->32 [1] 34/-1/-1->33->32 +gpub031:1764394:1764464 [1] NCCL INFO Channel 00/0 : 33[46000] -> 34[85000] via P2P/IPC +gpub031:1764394:1764464 [1] NCCL INFO Channel 01/0 : 33[46000] -> 34[85000] via P2P/IPC +gpub031:1764394:1764464 [1] NCCL INFO Connected all rings +gpub031:1764394:1764464 [1] NCCL INFO Channel 00/0 : 16[7000] -> 33[46000] [receive] via NET/IB/0 +gpub031:1764394:1764464 [1] NCCL INFO Channel 00/0 : 33[46000] -> 16[7000] [send] via NET/IB/0 +gpub031:1764394:1764464 [1] NCCL INFO Channel 00/0 : 33[46000] -> 32[7000] via P2P/IPC +gpub031:1764394:1764464 [1] NCCL INFO Channel 01/0 : 33[46000] -> 32[7000] via P2P/IPC +gpub031:1764394:1764464 [1] NCCL INFO Connected all trees +gpub031:1764394:1764464 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub031:1764394:1764464 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub031:1764394:1764464 [1] NCCL INFO comm 0x514068e0 rank 33 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub011:1300881:1300881 [1] NCCL INFO cudaDriverVersion 12010 +gpub011:1300881:1300881 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.111<0> +gpub011:1300881:1300881 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub011:1300881:1300947 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.111<0> +gpub011:1300881:1300947 [1] NCCL INFO Using network IB +gpub011:1300881:1300947 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub011:1300881:1300947 [1] NCCL INFO Trees [0] 10/4/-1->9->8 [1] 10/-1/-1->9->8 +gpub011:1300881:1300947 [1] NCCL INFO Channel 00/0 : 9[46000] -> 10[85000] via P2P/IPC +gpub011:1300881:1300947 [1] NCCL INFO Channel 01/0 : 9[46000] -> 10[85000] via P2P/IPC +gpub011:1300881:1300947 [1] NCCL INFO Connected all rings +gpub011:1300881:1300947 [1] NCCL INFO Channel 00/0 : 4[7000] -> 9[46000] [receive] via NET/IB/0 +gpub011:1300881:1300947 [1] NCCL INFO Channel 00/0 : 9[46000] -> 4[7000] [send] via NET/IB/0 +gpub011:1300881:1300947 [1] NCCL INFO Channel 00/0 : 9[46000] -> 8[7000] via P2P/IPC +gpub011:1300881:1300947 [1] NCCL INFO Channel 01/0 : 9[46000] -> 8[7000] via P2P/IPC +gpub011:1300881:1300947 [1] NCCL INFO Connected all trees +gpub011:1300881:1300947 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub011:1300881:1300947 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub011:1300881:1300947 [1] NCCL INFO comm 0xb0451f0 rank 9 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub031:1764396:1764396 [3] NCCL INFO cudaDriverVersion 12010 +gpub031:1764396:1764396 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.131<0> +gpub031:1764396:1764396 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub031:1764396:1764463 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.131<0> +gpub031:1764396:1764463 [3] NCCL INFO Using network IB +gpub031:1764396:1764463 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub031:1764396:1764463 [3] NCCL INFO Trees [0] -1/-1/-1->35->34 [1] -1/-1/-1->35->34 +gpub031:1764396:1764463 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [send] via NET/IB/0 +gpub031:1764396:1764463 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [send] via NET/IB/0 +gpub031:1764396:1764463 [3] NCCL INFO Connected all rings +gpub031:1764396:1764463 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 34[85000] via P2P/IPC +gpub031:1764396:1764463 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 34[85000] via P2P/IPC +gpub031:1764396:1764463 [3] NCCL INFO Connected all trees +gpub031:1764396:1764463 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub031:1764396:1764463 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub031:1764396:1764463 [3] NCCL INFO comm 0x9dc35740 rank 35 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub016:1262188:1262188 [1] NCCL INFO cudaDriverVersion 12010 +gpub016:1262188:1262188 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.116<0> +gpub016:1262188:1262188 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub016:1262188:1262262 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.116<0> +gpub016:1262188:1262262 [1] NCCL INFO Using network IB +gpub016:1262188:1262262 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub016:1262188:1262262 [1] NCCL INFO Trees [0] 30/-1/-1->29->28 [1] 30/44/-1->29->28 +gpub016:1262188:1262262 [1] NCCL INFO Channel 00/0 : 29[46000] -> 30[85000] via P2P/IPC +gpub016:1262188:1262262 [1] NCCL INFO Channel 01/0 : 29[46000] -> 30[85000] via P2P/IPC +gpub016:1262188:1262262 [1] NCCL INFO Connected all rings +gpub016:1262188:1262262 [1] NCCL INFO Channel 01/0 : 29[46000] -> 44[7000] [send] via NET/IB/0 +gpub016:1262188:1262262 [1] NCCL INFO Channel 01/0 : 44[7000] -> 29[46000] [receive] via NET/IB/0 +gpub016:1262188:1262262 [1] NCCL INFO Channel 00/0 : 29[46000] -> 28[7000] via P2P/IPC +gpub016:1262188:1262262 [1] NCCL INFO Channel 01/0 : 29[46000] -> 28[7000] via P2P/IPC +gpub016:1262188:1262262 [1] NCCL INFO Connected all trees +gpub016:1262188:1262262 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub016:1262188:1262262 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub016:1262188:1262262 [1] NCCL INFO comm 0x8bac38c0 rank 29 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub011:1300882:1300882 [2] NCCL INFO cudaDriverVersion 12010 +gpub011:1300882:1300882 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.111<0> +gpub011:1300882:1300882 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub011:1300882:1300949 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.111<0> +gpub011:1300882:1300949 [2] NCCL INFO Using network IB +gpub011:1300882:1300949 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub011:1300882:1300949 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 +gpub011:1300882:1300949 [2] NCCL INFO Channel 00/0 : 10[85000] -> 11[c7000] via P2P/IPC +gpub011:1300882:1300949 [2] NCCL INFO Channel 01/0 : 10[85000] -> 11[c7000] via P2P/IPC +gpub011:1300882:1300949 [2] NCCL INFO Connected all rings +gpub011:1300882:1300949 [2] NCCL INFO Channel 00/0 : 10[85000] -> 9[46000] via P2P/IPC +gpub011:1300882:1300949 [2] NCCL INFO Channel 01/0 : 10[85000] -> 9[46000] via P2P/IPC +gpub011:1300882:1300949 [2] NCCL INFO Connected all trees +gpub011:1300882:1300949 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub011:1300882:1300949 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub011:1300882:1300949 [2] NCCL INFO comm 0x507d2a60 rank 10 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub040:1881001:1881001 [2] NCCL INFO cudaDriverVersion 12010 +gpub040:1881001:1881001 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.140<0> +gpub040:1881001:1881001 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub040:1881001:1881063 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.140<0> +gpub040:1881001:1881063 [2] NCCL INFO Using network IB +gpub040:1881001:1881063 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub040:1881001:1881063 [2] NCCL INFO Trees [0] 59/-1/-1->58->57 [1] 59/-1/-1->58->57 +gpub040:1881001:1881063 [2] NCCL INFO Channel 00/0 : 58[85000] -> 59[c7000] via P2P/IPC +gpub040:1881001:1881063 [2] NCCL INFO Channel 01/0 : 58[85000] -> 59[c7000] via P2P/IPC +gpub040:1881001:1881063 [2] NCCL INFO Connected all rings +gpub040:1881001:1881063 [2] NCCL INFO Channel 00/0 : 58[85000] -> 57[46000] via P2P/IPC +gpub040:1881001:1881063 [2] NCCL INFO Channel 01/0 : 58[85000] -> 57[46000] via P2P/IPC +gpub040:1881001:1881063 [2] NCCL INFO Connected all trees +gpub040:1881001:1881063 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub040:1881001:1881063 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub040:1881001:1881063 [2] NCCL INFO comm 0x930e4a0 rank 58 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub066:1330380:1330380 [2] NCCL INFO cudaDriverVersion 12010 +gpub066:1330380:1330380 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.166<0> +gpub066:1330380:1330380 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub066:1330380:1330441 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.166<0> +gpub066:1330380:1330441 [2] NCCL INFO Using network IB +gpub066:1330380:1330441 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpub066:1330380:1330441 [2] NCCL INFO Trees [0] 87/-1/-1->86->85 [1] 87/-1/-1->86->85 +gpub066:1330380:1330441 [2] NCCL INFO Channel 00/0 : 86[85000] -> 87[c7000] via P2P/IPC +gpub066:1330380:1330441 [2] NCCL INFO Channel 01/0 : 86[85000] -> 87[c7000] via P2P/IPC +gpub066:1330380:1330441 [2] NCCL INFO Connected all rings +gpub066:1330380:1330441 [2] NCCL INFO Channel 00/0 : 86[85000] -> 85[46000] via P2P/IPC +gpub066:1330380:1330441 [2] NCCL INFO Channel 01/0 : 86[85000] -> 85[46000] via P2P/IPC +gpub066:1330380:1330441 [2] NCCL INFO Connected all trees +gpub066:1330380:1330441 [2] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub066:1330380:1330441 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub066:1330380:1330441 [2] NCCL INFO comm 0xb68349c0 rank 86 nranks 128 cudaDev 2 busId 85000 - Init COMPLETE +gpub040:1881002:1881002 [3] NCCL INFO cudaDriverVersion 12010 +gpub040:1881002:1881002 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.140<0> +gpub040:1881002:1881002 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub040:1881002:1881064 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.140<0> +gpub040:1881002:1881064 [3] NCCL INFO Using network IB +gpub040:1881002:1881064 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub040:1881002:1881064 [3] NCCL INFO Trees [0] -1/-1/-1->59->58 [1] -1/-1/-1->59->58 +gpub040:1881002:1881064 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [send] via NET/IB/0 +gpub040:1881002:1881064 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [send] via NET/IB/0 +gpub040:1881002:1881064 [3] NCCL INFO Connected all rings +gpub040:1881002:1881064 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 58[85000] via P2P/IPC +gpub040:1881002:1881064 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 58[85000] via P2P/IPC +gpub040:1881002:1881064 [3] NCCL INFO Connected all trees +gpub040:1881002:1881064 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub040:1881002:1881064 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub040:1881002:1881064 [3] NCCL INFO comm 0x8c834280 rank 59 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub040:1880999:1880999 [0] NCCL INFO cudaDriverVersion 12010 +gpub040:1880999:1880999 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.140<0> +gpub040:1880999:1880999 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub040:1880999:1881065 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.140<0> +gpub040:1880999:1881065 [0] NCCL INFO Using network IB +gpub040:1880999:1881065 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub040:1880999:1881065 [0] NCCL INFO Trees [0] 57/60/-1->56->48 [1] 57/-1/-1->56->53 +gpub040:1880999:1881065 [0] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [receive] via NET/IB/0 +gpub040:1880999:1881065 [0] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [receive] via NET/IB/0 +gpub040:1880999:1881065 [0] NCCL INFO Channel 00/0 : 56[7000] -> 57[46000] via P2P/IPC +gpub040:1880999:1881065 [0] NCCL INFO Channel 01/0 : 56[7000] -> 57[46000] via P2P/IPC +gpub040:1880999:1881065 [0] NCCL INFO Connected all rings +gpub040:1880999:1881065 [0] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [receive] via NET/IB/0 +gpub040:1880999:1881065 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [send] via NET/IB/0 +gpub040:1880999:1881065 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [receive] via NET/IB/0 +gpub040:1880999:1881065 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [send] via NET/IB/0 +gpub040:1880999:1881065 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [receive] via NET/IB/0 +gpub040:1880999:1881065 [0] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [send] via NET/IB/0 +gpub040:1880999:1881065 [0] NCCL INFO Connected all trees +gpub040:1880999:1881065 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub040:1880999:1881065 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub040:1880999:1881065 [0] NCCL INFO comm 0x8da18860 rank 56 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub040:1881000:1881000 [1] NCCL INFO cudaDriverVersion 12010 +gpub040:1881000:1881000 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.140<0> +gpub040:1881000:1881000 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub040:1881000:1881066 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.140<0> +gpub040:1881000:1881066 [1] NCCL INFO Using network IB +gpub040:1881000:1881066 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub040:1881000:1881066 [1] NCCL INFO Trees [0] 58/52/-1->57->56 [1] 58/-1/-1->57->56 +gpub040:1881000:1881066 [1] NCCL INFO Channel 00/0 : 57[46000] -> 58[85000] via P2P/IPC +gpub040:1881000:1881066 [1] NCCL INFO Channel 01/0 : 57[46000] -> 58[85000] via P2P/IPC +gpub040:1881000:1881066 [1] NCCL INFO Connected all rings +gpub040:1881000:1881066 [1] NCCL INFO Channel 00/0 : 52[7000] -> 57[46000] [receive] via NET/IB/0 +gpub040:1881000:1881066 [1] NCCL INFO Channel 00/0 : 57[46000] -> 52[7000] [send] via NET/IB/0 +gpub040:1881000:1881066 [1] NCCL INFO Channel 00/0 : 57[46000] -> 56[7000] via P2P/IPC +gpub040:1881000:1881066 [1] NCCL INFO Channel 01/0 : 57[46000] -> 56[7000] via P2P/IPC +gpub040:1881000:1881066 [1] NCCL INFO Connected all trees +gpub040:1881000:1881066 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub040:1881000:1881066 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub040:1881000:1881066 [1] NCCL INFO comm 0x9d50600 rank 57 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +gpub066:1330381:1330381 [3] NCCL INFO cudaDriverVersion 12010 +gpub066:1330381:1330381 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.166<0> +gpub066:1330381:1330381 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub066:1330381:1330442 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.166<0> +gpub066:1330381:1330442 [3] NCCL INFO Using network IB +gpub066:1330381:1330442 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpub066:1330381:1330442 [3] NCCL INFO Trees [0] -1/-1/-1->87->86 [1] -1/-1/-1->87->86 +gpub066:1330381:1330442 [3] NCCL INFO Channel 00/0 : 87[c7000] -> 88[7000] [send] via NET/IB/0 +gpub066:1330381:1330442 [3] NCCL INFO Channel 01/0 : 87[c7000] -> 88[7000] [send] via NET/IB/0 +gpub066:1330381:1330442 [3] NCCL INFO Connected all rings +gpub066:1330381:1330442 [3] NCCL INFO Channel 00/0 : 87[c7000] -> 86[85000] via P2P/IPC +gpub066:1330381:1330442 [3] NCCL INFO Channel 01/0 : 87[c7000] -> 86[85000] via P2P/IPC +gpub066:1330381:1330442 [3] NCCL INFO Connected all trees +gpub066:1330381:1330442 [3] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub066:1330381:1330442 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub066:1330381:1330442 [3] NCCL INFO comm 0x50ca2de0 rank 87 nranks 128 cudaDev 3 busId c7000 - Init COMPLETE +gpub066:1330378:1330378 [0] NCCL INFO cudaDriverVersion 12010 +gpub066:1330378:1330378 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.166<0> +gpub066:1330378:1330378 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub066:1330378:1330440 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.166<0> +gpub066:1330378:1330440 [0] NCCL INFO Using network IB +gpub066:1330378:1330440 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpub066:1330378:1330440 [0] NCCL INFO Trees [0] 85/-1/-1->84->89 [1] 85/80/-1->84->77 +gpub066:1330378:1330440 [0] NCCL INFO Channel 00/0 : 83[c7000] -> 84[7000] [receive] via NET/IB/0 +gpub066:1330378:1330440 [0] NCCL INFO Channel 01/0 : 83[c7000] -> 84[7000] [receive] via NET/IB/0 +gpub066:1330378:1330440 [0] NCCL INFO Channel 00/0 : 84[7000] -> 85[46000] via P2P/IPC +gpub066:1330378:1330440 [0] NCCL INFO Channel 01/0 : 84[7000] -> 85[46000] via P2P/IPC +gpub066:1330378:1330440 [0] NCCL INFO Connected all rings +gpub066:1330378:1330440 [0] NCCL INFO Channel 01/0 : 80[7000] -> 84[7000] [receive] via NET/IB/0 +gpub066:1330378:1330440 [0] NCCL INFO Channel 00/0 : 84[7000] -> 89[46000] [send] via NET/IB/0 +gpub066:1330378:1330440 [0] NCCL INFO Channel 01/0 : 77[46000] -> 84[7000] [receive] via NET/IB/0 +gpub066:1330378:1330440 [0] NCCL INFO Channel 01/0 : 84[7000] -> 77[46000] [send] via NET/IB/0 +gpub066:1330378:1330440 [0] NCCL INFO Channel 00/0 : 89[46000] -> 84[7000] [receive] via NET/IB/0 +gpub066:1330378:1330440 [0] NCCL INFO Channel 01/0 : 84[7000] -> 80[7000] [send] via NET/IB/0 +gpub066:1330378:1330440 [0] NCCL INFO Connected all trees +gpub066:1330378:1330440 [0] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub066:1330378:1330440 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub066:1330378:1330440 [0] NCCL INFO comm 0x8c91e400 rank 84 nranks 128 cudaDev 0 busId 7000 - Init COMPLETE +gpub066:1330379:1330379 [1] NCCL INFO cudaDriverVersion 12010 +gpub066:1330379:1330379 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.166<0> +gpub066:1330379:1330379 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +gpub066:1330379:1330439 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.166<0> +gpub066:1330379:1330439 [1] NCCL INFO Using network IB +gpub066:1330379:1330439 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpub066:1330379:1330439 [1] NCCL INFO Trees [0] 86/-1/-1->85->84 [1] 86/88/-1->85->84 +gpub066:1330379:1330439 [1] NCCL INFO Channel 00/0 : 85[46000] -> 86[85000] via P2P/IPC +gpub066:1330379:1330439 [1] NCCL INFO Channel 01/0 : 85[46000] -> 86[85000] via P2P/IPC +gpub066:1330379:1330439 [1] NCCL INFO Connected all rings +gpub066:1330379:1330439 [1] NCCL INFO Channel 01/0 : 85[46000] -> 88[7000] [send] via NET/IB/0 +gpub066:1330379:1330439 [1] NCCL INFO Channel 01/0 : 88[7000] -> 85[46000] [receive] via NET/IB/0 +gpub066:1330379:1330439 [1] NCCL INFO Channel 00/0 : 85[46000] -> 84[7000] via P2P/IPC +gpub066:1330379:1330439 [1] NCCL INFO Channel 01/0 : 85[46000] -> 84[7000] via P2P/IPC +gpub066:1330379:1330439 [1] NCCL INFO Connected all trees +gpub066:1330379:1330439 [1] NCCL INFO threadThresholds 8/8/64 | 1024/8/64 | 512 | 512 +gpub066:1330379:1330439 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpub066:1330379:1330439 [1] NCCL INFO comm 0xa42e450 rank 85 nranks 128 cudaDev 1 busId 46000 - Init COMPLETE +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[gpub001:0/128] 2023-07-02 01:48:07,181 (trainer:732) INFO: 1epoch:train:1-100batch: iter_time=1.540, forward_time=0.272, loss_ctc=540.181, loss_att=397.271, acc=0.027, loss=440.144, backward_time=1.098, grad_norm=584.638, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.122, optim0_lr0=1.288e-06, train_time=4.972 +[gpub001:0/128] 2023-07-02 01:50:35,497 (trainer:732) INFO: 1epoch:train:101-200batch: iter_time=1.152e-04, forward_time=0.141, loss_ctc=462.426, loss_att=336.277, acc=0.029, loss=374.121, backward_time=1.077, grad_norm=387.500, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=3.788e-06, train_time=1.483 +[gpub001:0/128] 2023-07-02 01:53:03,465 (trainer:732) INFO: 1epoch:train:201-300batch: iter_time=1.085e-04, forward_time=0.142, loss_ctc=461.451, loss_att=337.298, acc=0.045, loss=374.544, backward_time=1.078, grad_norm=417.405, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=6.287e-06, train_time=1.479 +[gpub001:0/128] 2023-07-02 01:55:31,849 (trainer:732) INFO: 1epoch:train:301-400batch: iter_time=1.042e-04, forward_time=0.143, loss_ctc=342.808, loss_att=285.551, acc=0.064, loss=302.728, backward_time=1.080, grad_norm=643.089, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=8.788e-06, train_time=1.484 +[gpub001:0/128] 2023-07-02 01:55:42,501 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpub001:0/128] 2023-07-02 01:56:04,392 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 01:56:08,967 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.7", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.7", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.7", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.7", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 01:56:08,967 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.7, +[gpub001:0/128] 2023-07-02 01:56:08,971 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 02:02:36,435 (trainer:732) INFO: 1epoch:train:401-500batch: iter_time=1.606, forward_time=0.145, loss_ctc=338.668, loss_att=312.612, acc=0.065, loss=320.429, backward_time=1.102, grad_norm=722.907, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=1.129e-05, train_time=4.246 +[gpub001:0/128] 2023-07-02 02:05:05,101 (trainer:732) INFO: 1epoch:train:501-600batch: iter_time=9.538e-05, forward_time=0.144, loss_ctc=290.690, loss_att=280.674, acc=0.090, loss=283.679, backward_time=1.078, grad_norm=573.866, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=1.379e-05, train_time=1.486 +[gpub001:0/128] 2023-07-02 02:07:33,618 (trainer:732) INFO: 1epoch:train:601-700batch: iter_time=9.612e-05, forward_time=0.143, loss_ctc=302.811, loss_att=317.380, acc=0.118, loss=313.009, backward_time=1.077, grad_norm=525.059, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=1.629e-05, train_time=1.485 +[gpub001:0/128] 2023-07-02 02:10:02,081 (trainer:732) INFO: 1epoch:train:701-800batch: iter_time=9.252e-05, forward_time=0.143, loss_ctc=263.886, loss_att=258.073, acc=0.153, loss=259.817, backward_time=1.077, grad_norm=413.463, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=1.879e-05, train_time=1.484 +[gpub001:0/128] 2023-07-02 02:10:03,693 (multiple_iter_factory:32) INFO: Building 2th iter-factory... +[gpub001:0/128] 2023-07-02 02:10:25,856 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 02:10:30,152 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.5", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 02:10:30,152 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.5, +[gpub001:0/128] 2023-07-02 02:10:30,156 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 02:18:38,072 (trainer:732) INFO: 1epoch:train:801-900batch: iter_time=1.499, forward_time=0.145, loss_ctc=297.783, loss_att=277.425, acc=0.155, loss=283.533, backward_time=1.202, grad_norm=432.564, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=2.129e-05, train_time=5.160 +[gpub001:0/128] 2023-07-02 02:21:09,704 (trainer:732) INFO: 1epoch:train:901-1000batch: iter_time=9.771e-05, forward_time=0.144, loss_ctc=274.105, loss_att=250.004, acc=0.168, loss=257.234, backward_time=1.078, grad_norm=408.850, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=2.379e-05, train_time=1.516 +[gpub001:0/128] 2023-07-02 02:23:38,330 (trainer:732) INFO: 1epoch:train:1001-1100batch: iter_time=1.074e-04, forward_time=0.143, loss_ctc=293.450, loss_att=288.752, acc=0.156, loss=290.161, backward_time=1.078, grad_norm=433.379, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=2.629e-05, train_time=1.486 +[gpub001:0/128] 2023-07-02 02:26:07,352 (trainer:732) INFO: 1epoch:train:1101-1200batch: iter_time=9.141e-05, forward_time=0.144, loss_ctc=254.103, loss_att=237.304, acc=0.175, loss=242.344, backward_time=1.079, grad_norm=378.541, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=2.879e-05, train_time=1.490 +[gpub001:0/128] 2023-07-02 02:26:09,180 (multiple_iter_factory:32) INFO: Building 3th iter-factory... +[gpub001:0/128] 2023-07-02 02:26:31,466 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 02:26:35,764 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.6", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.6", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.6", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.6", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 02:26:35,764 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.6, +[gpub001:0/128] 2023-07-02 02:26:35,768 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 02:33:43,766 (trainer:732) INFO: 1epoch:train:1201-1300batch: iter_time=1.566, forward_time=0.182, loss_ctc=284.394, loss_att=260.556, acc=0.170, loss=267.707, backward_time=1.096, grad_norm=319.756, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.123, optim0_lr0=3.129e-05, train_time=4.564 +[gpub001:0/128] 2023-07-02 02:36:17,719 (trainer:732) INFO: 1epoch:train:1301-1400batch: iter_time=1.084e-04, forward_time=0.145, loss_ctc=264.134, loss_att=232.279, acc=0.185, loss=241.835, backward_time=1.082, grad_norm=322.456, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=3.379e-05, train_time=1.539 +[gpub001:0/128] 2023-07-02 02:38:56,271 (trainer:732) INFO: 1epoch:train:1401-1500batch: iter_time=9.806e-05, forward_time=0.145, loss_ctc=283.768, loss_att=254.155, acc=0.174, loss=263.039, backward_time=1.085, grad_norm=323.685, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=3.629e-05, train_time=1.585 +[gpub001:0/128] 2023-07-02 02:41:27,198 (trainer:732) INFO: 1epoch:train:1501-1600batch: iter_time=9.429e-05, forward_time=0.145, loss_ctc=248.736, loss_att=223.280, acc=0.187, loss=230.917, backward_time=1.079, grad_norm=295.086, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=3.879e-05, train_time=1.509 +[gpub001:0/128] 2023-07-02 02:41:41,817 (multiple_iter_factory:32) INFO: Building 4th iter-factory... +[gpub001:0/128] 2023-07-02 02:42:03,909 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 02:42:08,170 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.1", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 02:42:08,170 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.1, +[gpub001:0/128] 2023-07-02 02:42:08,184 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 02:47:44,894 (trainer:732) INFO: 1epoch:train:1601-1700batch: iter_time=2.210, forward_time=0.168, loss_ctc=277.204, loss_att=247.225, acc=0.188, loss=256.219, backward_time=1.098, grad_norm=299.571, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=4.129e-05, train_time=3.776 +[gpub001:0/128] 2023-07-02 02:50:21,446 (trainer:732) INFO: 1epoch:train:1701-1800batch: iter_time=8.222e-05, forward_time=0.144, loss_ctc=256.731, loss_att=222.419, acc=0.198, loss=232.713, backward_time=1.097, grad_norm=275.648, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=4.379e-05, train_time=1.566 +[gpub001:0/128] 2023-07-02 02:53:02,939 (trainer:732) INFO: 1epoch:train:1801-1900batch: iter_time=8.187e-05, forward_time=0.144, loss_ctc=274.581, loss_att=254.361, acc=0.182, loss=260.427, backward_time=1.114, grad_norm=313.056, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=4.629e-05, train_time=1.615 +[gpub001:0/128] 2023-07-02 02:55:42,853 (trainer:732) INFO: 1epoch:train:1901-2000batch: iter_time=8.359e-05, forward_time=0.143, loss_ctc=236.100, loss_att=214.394, acc=0.200, loss=220.906, backward_time=1.087, grad_norm=255.461, clip=100.000, loss_scale=6.554e+04, optim_step_time=0.121, optim0_lr0=4.879e-05, train_time=1.599 +[gpub001:0/128] 2023-07-02 02:55:53,991 (multiple_iter_factory:32) INFO: Building 5th iter-factory... +[gpub001:0/128] 2023-07-02 02:56:16,340 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 02:56:20,644 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.4", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.4", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.4", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.4", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 02:56:20,644 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.4, +[gpub001:0/128] 2023-07-02 02:56:20,648 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 03:02:12,674 (trainer:732) INFO: 1epoch:train:2001-2100batch: iter_time=1.648, forward_time=0.145, loss_ctc=255.392, loss_att=234.231, acc=0.196, loss=240.580, backward_time=1.102, grad_norm=254.801, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=5.129e-05, train_time=3.898 +[gpub001:0/128] 2023-07-02 03:04:41,291 (trainer:732) INFO: 1epoch:train:2101-2200batch: iter_time=8.832e-05, forward_time=0.145, loss_ctc=228.533, loss_att=210.197, acc=0.210, loss=215.698, backward_time=1.080, grad_norm=265.408, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=5.379e-05, train_time=1.486 +[gpub001:0/128] 2023-07-02 03:07:09,624 (trainer:732) INFO: 1epoch:train:2201-2300batch: iter_time=9.138e-05, forward_time=0.144, loss_ctc=231.701, loss_att=230.702, acc=0.198, loss=231.002, backward_time=1.079, grad_norm=283.955, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=5.629e-05, train_time=1.483 +[gpub001:0/128] 2023-07-02 03:09:49,400 (trainer:732) INFO: 1epoch:train:2301-2400batch: iter_time=8.965e-05, forward_time=0.144, loss_ctc=196.101, loss_att=202.117, acc=0.215, loss=200.312, backward_time=1.099, grad_norm=248.405, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=5.879e-05, train_time=1.598 +[gpub001:0/128] 2023-07-02 03:09:51,010 (multiple_iter_factory:32) INFO: Building 6th iter-factory... +[gpub001:0/128] 2023-07-02 03:10:13,711 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 03:10:18,017 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.3", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.3", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.3", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.3", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 03:10:18,018 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.3, +[gpub001:0/128] 2023-07-02 03:10:18,021 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 03:18:03,936 (trainer:732) INFO: 1epoch:train:2401-2500batch: iter_time=1.551, forward_time=0.146, loss_ctc=209.517, loss_att=221.916, acc=0.228, loss=218.197, backward_time=1.101, grad_norm=277.157, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=6.129e-05, train_time=4.945 +[gpub001:0/128] 2023-07-02 03:20:32,902 (trainer:732) INFO: 1epoch:train:2501-2600batch: iter_time=1.010e-04, forward_time=0.146, loss_ctc=197.334, loss_att=192.816, acc=0.252, loss=194.171, backward_time=1.080, grad_norm=292.260, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=6.379e-05, train_time=1.489 +[gpub001:0/128] 2023-07-02 03:23:26,427 (trainer:732) INFO: 1epoch:train:2601-2700batch: iter_time=1.025e-04, forward_time=0.145, loss_ctc=194.768, loss_att=218.914, acc=0.251, loss=211.670, backward_time=1.099, grad_norm=328.740, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=6.629e-05, train_time=1.735 +[gpub001:0/128] 2023-07-02 03:25:58,159 (trainer:732) INFO: 1epoch:train:2701-2800batch: iter_time=9.939e-05, forward_time=0.146, loss_ctc=167.623, loss_att=168.951, acc=0.295, loss=168.553, backward_time=1.081, grad_norm=241.927, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=6.879e-05, train_time=1.517 +[gpub001:0/128] 2023-07-02 03:26:07,547 (multiple_iter_factory:32) INFO: Building 7th iter-factory... +[gpub001:0/128] 2023-07-02 03:26:29,813 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 03:26:34,386 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.9", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 03:26:34,386 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.9, +[gpub001:0/128] 2023-07-02 03:26:34,390 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 03:31:56,946 (trainer:732) INFO: 1epoch:train:2801-2900batch: iter_time=1.619, forward_time=0.172, loss_ctc=176.463, loss_att=177.295, acc=0.315, loss=177.046, backward_time=1.103, grad_norm=201.912, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.122, optim0_lr0=7.129e-05, train_time=3.588 +[gpub001:0/128] 2023-07-02 03:34:26,090 (trainer:732) INFO: 1epoch:train:2901-3000batch: iter_time=8.798e-05, forward_time=0.144, loss_ctc=169.381, loss_att=157.379, acc=0.328, loss=160.979, backward_time=1.077, grad_norm=221.600, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=7.379e-05, train_time=1.491 +[gpub001:0/128] 2023-07-02 03:36:55,616 (trainer:732) INFO: 1epoch:train:3001-3100batch: iter_time=9.043e-05, forward_time=0.146, loss_ctc=164.817, loss_att=190.673, acc=0.310, loss=182.916, backward_time=1.082, grad_norm=242.341, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=7.629e-05, train_time=1.495 +[gpub001:0/128] 2023-07-02 03:39:26,502 (trainer:732) INFO: 1epoch:train:3101-3200batch: iter_time=9.068e-05, forward_time=0.144, loss_ctc=152.565, loss_att=148.213, acc=0.340, loss=149.518, backward_time=1.083, grad_norm=231.027, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=7.879e-05, train_time=1.509 +[gpub001:0/128] 2023-07-02 03:39:30,891 (multiple_iter_factory:32) INFO: Building 8th iter-factory... +[gpub001:0/128] 2023-07-02 03:39:53,618 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 03:39:57,929 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.0", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.0", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.0", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.0", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 03:39:57,929 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.0, +[gpub001:0/128] 2023-07-02 03:39:57,932 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 03:47:03,652 (trainer:732) INFO: 1epoch:train:3201-3300batch: iter_time=2.099, forward_time=0.144, loss_ctc=155.601, loss_att=164.815, acc=0.345, loss=162.051, backward_time=1.111, grad_norm=210.374, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=8.129e-05, train_time=4.571 +[gpub001:0/128] 2023-07-02 03:49:32,184 (trainer:732) INFO: 1epoch:train:3301-3400batch: iter_time=1.183e-04, forward_time=0.144, loss_ctc=152.491, loss_att=143.312, acc=0.360, loss=146.066, backward_time=1.076, grad_norm=205.020, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=8.379e-05, train_time=1.486 +[gpub001:0/128] 2023-07-02 03:52:01,611 (trainer:732) INFO: 1epoch:train:3401-3500batch: iter_time=1.060e-04, forward_time=0.145, loss_ctc=148.864, loss_att=168.528, acc=0.335, loss=162.629, backward_time=1.078, grad_norm=210.197, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=8.629e-05, train_time=1.494 +[gpub001:0/128] 2023-07-02 03:54:31,233 (trainer:732) INFO: 1epoch:train:3501-3600batch: iter_time=9.133e-05, forward_time=0.145, loss_ctc=135.953, loss_att=134.611, acc=0.371, loss=135.014, backward_time=1.079, grad_norm=179.182, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=8.879e-05, train_time=1.496 +[gpub001:0/128] 2023-07-02 03:54:40,963 (multiple_iter_factory:32) INFO: Building 9th iter-factory... +[gpub001:0/128] 2023-07-02 03:55:03,510 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 03:55:07,768 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.8", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.8", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.8", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.8", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 03:55:07,768 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.8, +[gpub001:0/128] 2023-07-02 03:55:07,772 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 04:02:55,539 (trainer:732) INFO: 1epoch:train:3601-3700batch: iter_time=1.634, forward_time=0.172, loss_ctc=141.664, loss_att=149.355, acc=0.379, loss=147.048, backward_time=1.096, grad_norm=213.922, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=9.129e-05, train_time=5.043 +[gpub001:0/128] 2023-07-02 04:05:24,077 (trainer:732) INFO: 1epoch:train:3701-3800batch: iter_time=1.136e-04, forward_time=0.145, loss_ctc=140.225, loss_att=130.232, acc=0.390, loss=133.230, backward_time=1.077, grad_norm=217.100, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=9.379e-05, train_time=1.485 +[gpub001:0/128] 2023-07-02 04:07:53,148 (trainer:732) INFO: 1epoch:train:3801-3900batch: iter_time=1.102e-04, forward_time=0.144, loss_ctc=137.165, loss_att=157.511, acc=0.355, loss=151.407, backward_time=1.077, grad_norm=196.286, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=9.629e-05, train_time=1.490 +[gpub001:0/128] 2023-07-02 04:10:39,053 (trainer:732) INFO: 1epoch:train:3901-4000batch: iter_time=9.469e-05, forward_time=0.144, loss_ctc=124.160, loss_att=126.918, acc=0.390, loss=126.091, backward_time=1.090, grad_norm=241.832, clip=100.000, loss_scale=1.311e+05, optim_step_time=0.121, optim0_lr0=9.879e-05, train_time=1.659 +[gpub001:0/128] 2023-07-02 04:20:41,655 (trainer:338) INFO: 1epoch results: [train] iter_time=0.424, forward_time=0.150, loss_ctc=243.208, loss_att=225.399, acc=0.220, loss=230.742, backward_time=1.090, grad_norm=327.236, clip=100.000, loss_scale=9.830e+04, optim_step_time=0.121, optim0_lr0=5.004e-05, train_time=2.262, time=2 hours, 31 minutes and 2.4 seconds, total_count=4000, gpu_max_cached_mem_GB=33.912, [valid] loss_ctc=122.230, cer_ctc=0.606, loss_att=129.277, acc=0.275, cer=0.670, wer=1.000, loss=127.163, time=3 minutes and 57.44 seconds, total_count=506, gpu_max_cached_mem_GB=37.207, [att_plot] time=5 minutes and 51.9 seconds, total_count=0, gpu_max_cached_mem_GB=37.207 +[gpub001:0/128] 2023-07-02 04:20:57,442 (trainer:386) INFO: The best model has been updated: valid.acc, valid.total_count +[gpub001:0/128] 2023-07-02 04:20:57,442 (trainer:272) INFO: 2/100epoch started. Estimated time to finish: 1 week, 4 days and 1 hour +[gpub001:0/128] 2023-07-02 04:20:57,445 (multiple_iter_factory:32) INFO: Building 0th iter-factory... +[gpub001:0/128] 2023-07-02 04:21:19,393 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 04:21:23,457 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.6", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.6", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.6", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.6", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 04:21:23,458 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.6, +[gpub001:0/128] 2023-07-02 04:21:23,461 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 04:26:11,570 (trainer:732) INFO: 2epoch:train:1-100batch: iter_time=1.523, forward_time=0.165, loss_ctc=141.462, loss_att=147.694, acc=0.365, loss=145.825, backward_time=1.111, grad_norm=180.767, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.122, optim0_lr0=1.013e-04, train_time=3.141 +[gpub001:0/128] 2023-07-02 04:28:54,092 (trainer:732) INFO: 2epoch:train:101-200batch: iter_time=1.010e-04, forward_time=0.146, loss_ctc=127.915, loss_att=134.951, acc=0.409, loss=132.840, backward_time=1.103, grad_norm=154.665, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.038e-04, train_time=1.625 +[gpub001:0/128] 2023-07-02 04:31:31,556 (trainer:732) INFO: 2epoch:train:201-300batch: iter_time=9.197e-05, forward_time=0.200, loss_ctc=125.885, loss_att=122.785, acc=0.379, loss=123.715, backward_time=1.092, grad_norm=178.909, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.124, optim0_lr0=1.063e-04, train_time=1.574 +[gpub001:0/128] 2023-07-02 04:34:11,293 (trainer:732) INFO: 2epoch:train:301-400batch: iter_time=9.427e-05, forward_time=0.147, loss_ctc=121.684, loss_att=132.872, acc=0.377, loss=129.516, backward_time=1.105, grad_norm=197.046, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.088e-04, train_time=1.597 +[gpub001:0/128] 2023-07-02 04:34:19,159 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpub001:0/128] 2023-07-02 04:34:40,978 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 04:34:45,187 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.1", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 04:34:45,187 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.1, +[gpub001:0/128] 2023-07-02 04:34:45,191 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 04:42:26,425 (trainer:732) INFO: 2epoch:train:401-500batch: iter_time=2.464, forward_time=0.183, loss_ctc=133.533, loss_att=141.745, acc=0.391, loss=139.281, backward_time=1.100, grad_norm=177.034, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.122, optim0_lr0=1.113e-04, train_time=4.951 +[gpub001:0/128] 2023-07-02 04:44:57,986 (trainer:732) INFO: 2epoch:train:501-600batch: iter_time=1.026e-04, forward_time=0.146, loss_ctc=117.575, loss_att=129.798, acc=0.436, loss=126.131, backward_time=1.087, grad_norm=157.194, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.138e-04, train_time=1.516 +[gpub001:0/128] 2023-07-02 04:47:30,329 (trainer:732) INFO: 2epoch:train:601-700batch: iter_time=1.003e-04, forward_time=0.146, loss_ctc=114.442, loss_att=114.728, acc=0.414, loss=114.642, backward_time=1.082, grad_norm=169.206, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.163e-04, train_time=1.523 +[gpub001:0/128] 2023-07-02 04:50:12,389 (trainer:732) INFO: 2epoch:train:701-800batch: iter_time=9.729e-05, forward_time=0.145, loss_ctc=117.962, loss_att=130.095, acc=0.398, loss=126.455, backward_time=1.107, grad_norm=202.411, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.188e-04, train_time=1.620 +[gpub001:0/128] 2023-07-02 04:50:19,407 (multiple_iter_factory:32) INFO: Building 2th iter-factory... +[gpub001:0/128] 2023-07-02 04:50:41,623 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 04:50:45,945 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.5", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 04:50:45,945 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.5, +[gpub001:0/128] 2023-07-02 04:50:45,949 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 04:58:45,325 (trainer:732) INFO: 2epoch:train:801-900batch: iter_time=1.604, forward_time=0.186, loss_ctc=126.010, loss_att=132.556, acc=0.414, loss=130.592, backward_time=1.105, grad_norm=175.243, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.213e-04, train_time=5.129 +[gpub001:0/128] 2023-07-02 05:01:17,736 (trainer:732) INFO: 2epoch:train:901-1000batch: iter_time=9.383e-05, forward_time=0.149, loss_ctc=109.712, loss_att=118.287, acc=0.466, loss=115.715, backward_time=1.088, grad_norm=142.149, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.238e-04, train_time=1.524 +[gpub001:0/128] 2023-07-02 05:03:47,575 (trainer:732) INFO: 2epoch:train:1001-1100batch: iter_time=1.014e-04, forward_time=0.147, loss_ctc=110.053, loss_att=106.480, acc=0.439, loss=107.552, backward_time=1.086, grad_norm=153.951, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.263e-04, train_time=1.498 +[gpub001:0/128] 2023-07-02 05:06:16,384 (trainer:732) INFO: 2epoch:train:1101-1200batch: iter_time=9.267e-05, forward_time=0.147, loss_ctc=107.375, loss_att=120.439, acc=0.425, loss=116.520, backward_time=1.082, grad_norm=193.914, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.288e-04, train_time=1.488 +[gpub001:0/128] 2023-07-02 05:06:27,229 (multiple_iter_factory:32) INFO: Building 3th iter-factory... +[gpub001:0/128] 2023-07-02 05:06:49,654 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 05:06:54,182 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.9", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 05:06:54,182 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.9, +[gpub001:0/128] 2023-07-02 05:06:54,186 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 05:13:53,395 (trainer:732) INFO: 2epoch:train:1201-1300batch: iter_time=2.083, forward_time=0.188, loss_ctc=119.429, loss_att=122.178, acc=0.442, loss=121.353, backward_time=1.161, grad_norm=211.966, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.122, optim0_lr0=1.313e-04, train_time=4.569 +[gpub001:0/128] 2023-07-02 05:16:51,682 (trainer:732) INFO: 2epoch:train:1301-1400batch: iter_time=8.730e-05, forward_time=0.148, loss_ctc=105.735, loss_att=111.683, acc=0.485, loss=109.899, backward_time=1.179, grad_norm=138.234, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.338e-04, train_time=1.784 +[gpub001:0/128] 2023-07-02 05:19:35,325 (trainer:732) INFO: 2epoch:train:1401-1500batch: iter_time=9.301e-05, forward_time=0.145, loss_ctc=104.633, loss_att=99.671, acc=0.460, loss=101.159, backward_time=1.106, grad_norm=170.937, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.363e-04, train_time=1.636 +[gpub001:0/128] 2023-07-02 05:22:31,703 (trainer:732) INFO: 2epoch:train:1501-1600batch: iter_time=9.055e-05, forward_time=0.146, loss_ctc=102.772, loss_att=110.998, acc=0.455, loss=108.530, backward_time=1.109, grad_norm=160.641, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.388e-04, train_time=1.764 +[gpub001:0/128] 2023-07-02 05:22:33,982 (multiple_iter_factory:32) INFO: Building 4th iter-factory... +[gpub001:0/128] 2023-07-02 05:22:56,271 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 05:23:00,511 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.0", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.0", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.0", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.0", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 05:23:00,511 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.0, +[gpub001:0/128] 2023-07-02 05:23:00,515 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 05:29:35,230 (trainer:732) INFO: 2epoch:train:1601-1700batch: iter_time=1.551, forward_time=0.149, loss_ctc=113.295, loss_att=115.104, acc=0.459, loss=114.561, backward_time=1.108, grad_norm=155.653, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.413e-04, train_time=4.235 +[gpub001:0/128] 2023-07-02 05:32:13,472 (trainer:732) INFO: 2epoch:train:1701-1800batch: iter_time=1.052e-04, forward_time=0.146, loss_ctc=103.800, loss_att=106.712, acc=0.499, loss=105.838, backward_time=1.096, grad_norm=148.694, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.438e-04, train_time=1.582 +[gpub001:0/128] 2023-07-02 05:34:44,526 (trainer:732) INFO: 2epoch:train:1801-1900batch: iter_time=1.072e-04, forward_time=0.147, loss_ctc=101.364, loss_att=92.579, acc=0.479, loss=95.215, backward_time=1.082, grad_norm=163.286, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.463e-04, train_time=1.510 +[gpub001:0/128] 2023-07-02 05:37:18,235 (trainer:732) INFO: 2epoch:train:1901-2000batch: iter_time=9.932e-05, forward_time=0.146, loss_ctc=102.646, loss_att=106.142, acc=0.463, loss=105.093, backward_time=1.095, grad_norm=182.571, clip=100.000, loss_scale=2.621e+05, optim_step_time=0.121, optim0_lr0=1.488e-04, train_time=1.537 +[gpub001:0/128] 2023-07-02 05:37:19,899 (multiple_iter_factory:32) INFO: Building 5th iter-factory... +[gpub001:0/128] 2023-07-02 05:37:42,251 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 05:37:46,475 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.4", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.4", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.4", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.4", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 05:37:46,475 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.4, +[gpub001:0/128] 2023-07-02 05:37:46,479 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 05:48:10,631 (trainer:732) INFO: 2epoch:train:2001-2100batch: iter_time=1.530, forward_time=0.146, loss_ctc=116.459, loss_att=113.836, acc=0.467, loss=114.623, backward_time=1.101, grad_norm=190.683, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.513e-04, train_time=6.524 +[gpub001:0/128] 2023-07-02 05:50:53,948 (trainer:732) INFO: 2epoch:train:2101-2200batch: iter_time=9.630e-05, forward_time=0.149, loss_ctc=98.615, loss_att=98.760, acc=0.520, loss=98.716, backward_time=1.108, grad_norm=133.124, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.538e-04, train_time=1.633 +[gpub001:0/128] 2023-07-02 05:53:22,527 (trainer:732) INFO: 2epoch:train:2201-2300batch: iter_time=1.008e-04, forward_time=0.146, loss_ctc=96.354, loss_att=87.441, acc=0.497, loss=90.115, backward_time=1.080, grad_norm=137.996, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.563e-04, train_time=1.486 +[gpub001:0/128] 2023-07-02 05:55:51,147 (trainer:732) INFO: 2epoch:train:2301-2400batch: iter_time=9.314e-05, forward_time=0.147, loss_ctc=97.507, loss_att=99.440, acc=0.485, loss=98.860, backward_time=1.081, grad_norm=204.800, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.588e-04, train_time=1.486 +[gpub001:0/128] 2023-07-02 05:55:53,085 (multiple_iter_factory:32) INFO: Building 6th iter-factory... +[gpub001:0/128] 2023-07-02 05:56:15,243 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 05:56:19,560 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.7", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.7", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.7", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.7", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 05:56:19,560 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.7, +[gpub001:0/128] 2023-07-02 05:56:19,564 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 06:03:08,333 (trainer:732) INFO: 2epoch:train:2401-2500batch: iter_time=1.611, forward_time=0.178, loss_ctc=108.028, loss_att=109.475, acc=0.488, loss=109.041, backward_time=1.104, grad_norm=160.868, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.122, optim0_lr0=1.613e-04, train_time=4.371 +[gpub001:0/128] 2023-07-02 06:05:38,294 (trainer:732) INFO: 2epoch:train:2501-2600batch: iter_time=9.944e-05, forward_time=0.147, loss_ctc=100.030, loss_att=100.384, acc=0.532, loss=100.278, backward_time=1.086, grad_norm=166.489, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.638e-04, train_time=1.500 +[gpub001:0/128] 2023-07-02 06:08:07,446 (trainer:732) INFO: 2epoch:train:2601-2700batch: iter_time=1.010e-04, forward_time=0.147, loss_ctc=95.719, loss_att=86.792, acc=0.515, loss=89.470, backward_time=1.082, grad_norm=137.126, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.663e-04, train_time=1.491 +[gpub001:0/128] 2023-07-02 06:10:36,538 (trainer:732) INFO: 2epoch:train:2701-2800batch: iter_time=1.025e-04, forward_time=0.148, loss_ctc=95.055, loss_att=99.392, acc=0.503, loss=98.091, backward_time=1.084, grad_norm=141.816, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.688e-04, train_time=1.491 +[gpub001:0/128] 2023-07-02 06:10:46,481 (multiple_iter_factory:32) INFO: Building 7th iter-factory... +[gpub001:0/128] 2023-07-02 06:11:08,639 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 06:11:12,893 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.3", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.3", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.3", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.3", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 06:11:12,893 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.3, +[gpub001:0/128] 2023-07-02 06:11:12,897 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 06:18:12,047 (trainer:732) INFO: 2epoch:train:2801-2900batch: iter_time=2.408, forward_time=0.148, loss_ctc=105.117, loss_att=104.174, acc=0.506, loss=104.457, backward_time=1.103, grad_norm=133.346, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.713e-04, train_time=4.555 +[gpub001:0/128] 2023-07-02 06:20:42,419 (trainer:732) INFO: 2epoch:train:2901-3000batch: iter_time=1.216e-04, forward_time=0.145, loss_ctc=97.400, loss_att=94.758, acc=0.546, loss=95.551, backward_time=1.086, grad_norm=188.040, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.738e-04, train_time=1.504 +[gpub001:0/128] 2023-07-02 06:23:24,163 (trainer:732) INFO: 2epoch:train:3001-3100batch: iter_time=1.230e-04, forward_time=0.145, loss_ctc=98.565, loss_att=87.186, acc=0.519, loss=90.599, backward_time=1.106, grad_norm=160.472, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.763e-04, train_time=1.617 +[gpub001:0/128] 2023-07-02 06:25:58,596 (trainer:732) INFO: 2epoch:train:3101-3200batch: iter_time=9.116e-05, forward_time=0.146, loss_ctc=94.556, loss_att=97.305, acc=0.507, loss=96.480, backward_time=1.088, grad_norm=152.535, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.788e-04, train_time=1.544 +[gpub001:0/128] 2023-07-02 06:26:00,561 (multiple_iter_factory:32) INFO: Building 8th iter-factory... +[gpub001:0/128] 2023-07-02 06:26:23,005 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 06:26:27,267 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.8", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.8", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.8", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.8", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 06:26:27,267 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.8, +[gpub001:0/128] 2023-07-02 06:26:27,271 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 06:33:58,814 (trainer:732) INFO: 2epoch:train:3201-3300batch: iter_time=1.523, forward_time=0.163, loss_ctc=105.148, loss_att=101.685, acc=0.514, loss=102.724, backward_time=1.128, grad_norm=146.660, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.122, optim0_lr0=1.813e-04, train_time=4.802 +[gpub001:0/128] 2023-07-02 06:36:43,188 (trainer:732) INFO: 2epoch:train:3301-3400batch: iter_time=8.705e-05, forward_time=0.148, loss_ctc=93.198, loss_att=91.092, acc=0.554, loss=91.724, backward_time=1.095, grad_norm=154.320, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.838e-04, train_time=1.644 +[gpub001:0/128] 2023-07-02 06:39:35,381 (trainer:732) INFO: 2epoch:train:3401-3500batch: iter_time=9.148e-05, forward_time=0.145, loss_ctc=93.879, loss_att=82.609, acc=0.528, loss=85.990, backward_time=1.097, grad_norm=142.568, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.863e-04, train_time=1.722 +[gpub001:0/128] 2023-07-02 06:42:04,168 (trainer:732) INFO: 2epoch:train:3501-3600batch: iter_time=8.728e-05, forward_time=0.146, loss_ctc=91.983, loss_att=91.456, acc=0.522, loss=91.614, backward_time=1.081, grad_norm=151.785, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.888e-04, train_time=1.488 +[gpub001:0/128] 2023-07-02 06:42:05,883 (multiple_iter_factory:32) INFO: Building 9th iter-factory... +[gpub001:0/128] 2023-07-02 06:42:28,495 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 06:42:32,765 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.2", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.2", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.2", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.2", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 06:42:32,765 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.2, +[gpub001:0/128] 2023-07-02 06:42:32,769 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 06:48:18,823 (trainer:732) INFO: 2epoch:train:3601-3700batch: iter_time=1.599, forward_time=0.170, loss_ctc=105.844, loss_att=98.496, acc=0.521, loss=100.700, backward_time=1.106, grad_norm=150.808, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.913e-04, train_time=3.746 +[gpub001:0/128] 2023-07-02 06:50:50,784 (trainer:732) INFO: 2epoch:train:3701-3800batch: iter_time=9.644e-05, forward_time=0.146, loss_ctc=94.459, loss_att=88.622, acc=0.558, loss=90.373, backward_time=1.084, grad_norm=127.971, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.938e-04, train_time=1.520 +[gpub001:0/128] 2023-07-02 06:53:19,863 (trainer:732) INFO: 2epoch:train:3801-3900batch: iter_time=9.916e-05, forward_time=0.146, loss_ctc=90.685, loss_att=77.556, acc=0.547, loss=81.494, backward_time=1.082, grad_norm=162.079, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.963e-04, train_time=1.491 +[gpub001:0/128] 2023-07-02 06:55:49,487 (trainer:732) INFO: 2epoch:train:3901-4000batch: iter_time=9.854e-05, forward_time=0.145, loss_ctc=93.244, loss_att=90.302, acc=0.527, loss=91.184, backward_time=1.080, grad_norm=140.916, clip=100.000, loss_scale=5.243e+05, optim_step_time=0.121, optim0_lr0=1.988e-04, train_time=1.496 +[gpub001:0/128] 2023-07-02 07:05:45,651 (trainer:338) INFO: 2epoch results: [train] iter_time=0.448, forward_time=0.153, loss_ctc=106.978, loss_att=107.457, acc=0.475, loss=107.313, backward_time=1.099, grad_norm=162.472, clip=100.000, loss_scale=3.932e+05, optim_step_time=0.121, optim0_lr0=1.500e-04, train_time=2.323, time=2 hours, 35 minutes and 3.24 seconds, total_count=8000, gpu_max_cached_mem_GB=37.209, [valid] loss_ctc=97.384, cer_ctc=0.435, loss_att=89.451, acc=0.424, cer=0.559, wer=1.000, loss=91.831, time=3 minutes and 54.37 seconds, total_count=1012, gpu_max_cached_mem_GB=37.209, [att_plot] time=5 minutes and 50.55 seconds, total_count=0, gpu_max_cached_mem_GB=37.209 +[gpub001:0/128] 2023-07-02 07:06:05,363 (trainer:386) INFO: The best model has been updated: valid.acc, valid.total_count +[gpub001:0/128] 2023-07-02 07:06:05,432 (trainer:272) INFO: 3/100epoch started. Estimated time to finish: 1 week, 4 days and 2 hours +[gpub001:0/128] 2023-07-02 07:06:06,952 (multiple_iter_factory:32) INFO: Building 0th iter-factory... +[gpub001:0/128] 2023-07-02 07:06:28,662 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 07:06:35,113 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.7", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.7", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.7", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.7", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 07:06:35,113 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.7, +[gpub001:0/128] 2023-07-02 07:06:35,236 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 07:14:50,669 (trainer:732) INFO: 3epoch:train:1-100batch: iter_time=3.597, forward_time=0.184, loss_ctc=99.267, loss_att=94.412, acc=0.525, loss=95.868, backward_time=1.110, grad_norm=176.962, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.123, optim0_lr0=2.013e-04, train_time=5.245 +[gpub001:0/128] 2023-07-02 07:17:25,258 (trainer:732) INFO: 3epoch:train:101-200batch: iter_time=9.299e-05, forward_time=0.157, loss_ctc=113.344, loss_att=105.667, acc=0.511, loss=107.970, backward_time=1.096, grad_norm=200.073, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.038e-04, train_time=1.546 +[gpub001:0/128] 2023-07-02 07:20:08,475 (trainer:732) INFO: 3epoch:train:201-300batch: iter_time=9.056e-05, forward_time=0.183, loss_ctc=101.734, loss_att=111.837, acc=0.507, loss=108.806, backward_time=1.108, grad_norm=163.340, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.123, optim0_lr0=2.063e-04, train_time=1.632 +[gpub001:0/128] 2023-07-02 07:22:51,493 (trainer:732) INFO: 3epoch:train:301-400batch: iter_time=8.434e-05, forward_time=0.235, loss_ctc=104.299, loss_att=105.729, acc=0.534, loss=105.300, backward_time=1.113, grad_norm=189.640, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.125, optim0_lr0=2.088e-04, train_time=1.630 +[gpub001:0/128] 2023-07-02 07:23:00,414 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpub001:0/128] 2023-07-02 07:23:22,231 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 07:23:26,426 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.3", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.3", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.3", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.3", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 07:23:26,426 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.3, +[gpub001:0/128] 2023-07-02 07:23:26,430 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 07:29:39,256 (trainer:732) INFO: 3epoch:train:401-500batch: iter_time=1.897, forward_time=0.147, loss_ctc=97.563, loss_att=88.949, acc=0.535, loss=91.534, backward_time=1.113, grad_norm=154.738, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.113e-04, train_time=4.077 +[gpub001:0/128] 2023-07-02 07:32:09,205 (trainer:732) INFO: 3epoch:train:501-600batch: iter_time=9.931e-05, forward_time=0.146, loss_ctc=105.782, loss_att=97.985, acc=0.527, loss=100.324, backward_time=1.085, grad_norm=177.959, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.138e-04, train_time=1.499 +[gpub001:0/128] 2023-07-02 07:34:38,498 (trainer:732) INFO: 3epoch:train:601-700batch: iter_time=1.042e-04, forward_time=0.147, loss_ctc=99.455, loss_att=104.887, acc=0.524, loss=103.257, backward_time=1.082, grad_norm=128.974, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.163e-04, train_time=1.493 +[gpub001:0/128] 2023-07-02 07:37:07,052 (trainer:732) INFO: 3epoch:train:701-800batch: iter_time=9.803e-05, forward_time=0.146, loss_ctc=98.270, loss_att=100.056, acc=0.545, loss=99.520, backward_time=1.081, grad_norm=161.994, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.188e-04, train_time=1.485 +[gpub001:0/128] 2023-07-02 07:37:09,329 (multiple_iter_factory:32) INFO: Building 2th iter-factory... +[gpub001:0/128] 2023-07-02 07:37:31,341 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 07:37:35,616 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.9", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 07:37:35,616 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.9, +[gpub001:0/128] 2023-07-02 07:37:35,620 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 07:45:33,041 (trainer:732) INFO: 3epoch:train:801-900batch: iter_time=1.497, forward_time=0.145, loss_ctc=93.991, loss_att=84.725, acc=0.550, loss=87.505, backward_time=1.122, grad_norm=141.107, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.213e-04, train_time=5.060 +[gpub001:0/128] 2023-07-02 07:48:06,099 (trainer:732) INFO: 3epoch:train:901-1000batch: iter_time=9.286e-05, forward_time=0.146, loss_ctc=104.802, loss_att=93.877, acc=0.536, loss=97.154, backward_time=1.088, grad_norm=185.883, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.238e-04, train_time=1.530 +[gpub001:0/128] 2023-07-02 07:50:42,566 (trainer:732) INFO: 3epoch:train:1001-1100batch: iter_time=9.304e-05, forward_time=0.145, loss_ctc=98.120, loss_att=102.667, acc=0.530, loss=101.303, backward_time=1.094, grad_norm=167.675, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.263e-04, train_time=1.564 +[gpub001:0/128] 2023-07-02 07:53:11,245 (trainer:732) INFO: 3epoch:train:1101-1200batch: iter_time=8.833e-05, forward_time=0.144, loss_ctc=100.707, loss_att=99.351, acc=0.551, loss=99.758, backward_time=1.081, grad_norm=199.649, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.288e-04, train_time=1.487 +[gpub001:0/128] 2023-07-02 07:53:12,995 (multiple_iter_factory:32) INFO: Building 3th iter-factory... +[gpub001:0/128] 2023-07-02 07:53:35,401 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 07:53:39,690 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.8", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.8", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.8", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.8", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 07:53:39,690 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.8, +[gpub001:0/128] 2023-07-02 07:53:39,694 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 07:59:30,939 (trainer:732) INFO: 3epoch:train:1201-1300batch: iter_time=1.559, forward_time=0.174, loss_ctc=94.186, loss_att=87.170, acc=0.539, loss=89.275, backward_time=1.106, grad_norm=328.801, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.313e-04, train_time=3.796 +[gpub001:0/128] 2023-07-02 08:02:05,406 (trainer:732) INFO: 3epoch:train:1301-1400batch: iter_time=1.010e-04, forward_time=0.146, loss_ctc=99.263, loss_att=93.429, acc=0.537, loss=95.179, backward_time=1.098, grad_norm=148.362, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.338e-04, train_time=1.545 +[gpub001:0/128] 2023-07-02 08:04:34,756 (trainer:732) INFO: 3epoch:train:1401-1500batch: iter_time=1.014e-04, forward_time=0.146, loss_ctc=94.767, loss_att=98.356, acc=0.536, loss=97.279, backward_time=1.083, grad_norm=130.626, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.363e-04, train_time=1.493 +[gpub001:0/128] 2023-07-02 08:07:05,149 (trainer:732) INFO: 3epoch:train:1501-1600batch: iter_time=1.019e-04, forward_time=0.145, loss_ctc=100.064, loss_att=98.786, acc=0.540, loss=99.170, backward_time=1.089, grad_norm=163.999, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.388e-04, train_time=1.504 +[gpub001:0/128] 2023-07-02 08:07:16,192 (multiple_iter_factory:32) INFO: Building 4th iter-factory... +[gpub001:0/128] 2023-07-02 08:07:38,844 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 08:07:43,426 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.4", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.4", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.4", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.4", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 08:07:43,426 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.4, +[gpub001:0/128] 2023-07-02 08:07:43,431 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 08:15:07,803 (trainer:732) INFO: 3epoch:train:1601-1700batch: iter_time=2.782, forward_time=0.171, loss_ctc=91.055, loss_att=81.218, acc=0.556, loss=84.169, backward_time=1.110, grad_norm=128.434, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.122, optim0_lr0=2.413e-04, train_time=4.826 +[gpub001:0/128] 2023-07-02 08:17:37,685 (trainer:732) INFO: 3epoch:train:1701-1800batch: iter_time=1.161e-04, forward_time=0.149, loss_ctc=98.061, loss_att=89.586, acc=0.548, loss=92.128, backward_time=1.082, grad_norm=146.044, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.438e-04, train_time=1.499 +[gpub001:0/128] 2023-07-02 08:20:09,607 (trainer:732) INFO: 3epoch:train:1801-1900batch: iter_time=1.026e-04, forward_time=0.146, loss_ctc=98.893, loss_att=99.335, acc=0.537, loss=99.202, backward_time=1.081, grad_norm=155.831, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.463e-04, train_time=1.519 +[gpub001:0/128] 2023-07-02 08:22:38,993 (trainer:732) INFO: 3epoch:train:1901-2000batch: iter_time=1.062e-04, forward_time=0.147, loss_ctc=97.226, loss_att=98.121, acc=0.542, loss=97.853, backward_time=1.080, grad_norm=188.439, clip=100.000, loss_scale=1.049e+06, optim_step_time=0.121, optim0_lr0=2.488e-04, train_time=1.494 +[gpub001:0/128] 2023-07-02 08:22:52,777 (multiple_iter_factory:32) INFO: Building 5th iter-factory... +[gpub001:0/128] 2023-07-02 08:23:14,999 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 08:23:19,324 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.5", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 08:23:19,324 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.5, +[gpub001:0/128] 2023-07-02 08:23:19,328 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 08:30:03,463 (trainer:732) INFO: 3epoch:train:2001-2100batch: iter_time=2.802, forward_time=0.146, loss_ctc=91.586, loss_att=82.817, acc=0.563, loss=85.448, backward_time=1.105, grad_norm=140.635, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.494e-04, train_time=4.444 +[gpub001:0/128] 2023-07-02 08:32:33,043 (trainer:732) INFO: 3epoch:train:2101-2200batch: iter_time=1.024e-04, forward_time=0.147, loss_ctc=98.382, loss_att=88.675, acc=0.561, loss=91.587, backward_time=1.083, grad_norm=161.427, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.481e-04, train_time=1.496 +[gpub001:0/128] 2023-07-02 08:35:02,072 (trainer:732) INFO: 3epoch:train:2201-2300batch: iter_time=1.103e-04, forward_time=0.146, loss_ctc=94.521, loss_att=95.214, acc=0.555, loss=95.006, backward_time=1.083, grad_norm=120.258, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.469e-04, train_time=1.490 +[gpub001:0/128] 2023-07-02 08:37:38,878 (trainer:732) INFO: 3epoch:train:2301-2400batch: iter_time=1.005e-04, forward_time=0.146, loss_ctc=95.535, loss_att=91.789, acc=0.581, loss=92.913, backward_time=1.089, grad_norm=130.439, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.457e-04, train_time=1.568 +[gpub001:0/128] 2023-07-02 08:37:40,534 (multiple_iter_factory:32) INFO: Building 6th iter-factory... +[gpub001:0/128] 2023-07-02 08:38:03,105 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 08:38:07,445 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.0", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.0", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.0", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.0", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 08:38:07,445 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.0, +[gpub001:0/128] 2023-07-02 08:38:07,449 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 08:44:55,772 (trainer:732) INFO: 3epoch:train:2401-2500batch: iter_time=1.576, forward_time=0.176, loss_ctc=88.566, loss_att=78.855, acc=0.571, loss=81.769, backward_time=1.099, grad_norm=125.630, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.122, optim0_lr0=2.445e-04, train_time=4.368 +[gpub001:0/128] 2023-07-02 08:47:25,999 (trainer:732) INFO: 3epoch:train:2501-2600batch: iter_time=1.039e-04, forward_time=0.146, loss_ctc=98.401, loss_att=88.608, acc=0.556, loss=91.546, backward_time=1.085, grad_norm=149.550, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.434e-04, train_time=1.502 +[gpub001:0/128] 2023-07-02 08:49:54,806 (trainer:732) INFO: 3epoch:train:2601-2700batch: iter_time=9.939e-05, forward_time=0.146, loss_ctc=92.927, loss_att=93.438, acc=0.553, loss=93.285, backward_time=1.081, grad_norm=122.391, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.422e-04, train_time=1.488 +[gpub001:0/128] 2023-07-02 08:52:23,625 (trainer:732) INFO: 3epoch:train:2701-2800batch: iter_time=9.931e-05, forward_time=0.146, loss_ctc=92.326, loss_att=92.323, acc=0.561, loss=92.324, backward_time=1.081, grad_norm=125.602, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.411e-04, train_time=1.488 +[gpub001:0/128] 2023-07-02 08:52:26,455 (multiple_iter_factory:32) INFO: Building 7th iter-factory... +[gpub001:0/128] 2023-07-02 08:52:48,903 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 08:52:53,114 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.2", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.2", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.2", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.2", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 08:52:53,114 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.2, +[gpub001:0/128] 2023-07-02 08:52:53,118 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 08:59:50,440 (trainer:732) INFO: 3epoch:train:2801-2900batch: iter_time=1.553, forward_time=0.146, loss_ctc=93.235, loss_att=78.888, acc=0.571, loss=83.192, backward_time=1.117, grad_norm=128.780, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.400e-04, train_time=4.468 +[gpub001:0/128] 2023-07-02 09:02:28,341 (trainer:732) INFO: 3epoch:train:2901-3000batch: iter_time=1.081e-04, forward_time=0.147, loss_ctc=96.098, loss_att=83.970, acc=0.569, loss=87.609, backward_time=1.098, grad_norm=147.013, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.389e-04, train_time=1.579 +[gpub001:0/128] 2023-07-02 09:04:57,396 (trainer:732) INFO: 3epoch:train:3001-3100batch: iter_time=9.911e-05, forward_time=0.145, loss_ctc=91.842, loss_att=91.034, acc=0.560, loss=91.276, backward_time=1.081, grad_norm=132.321, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.378e-04, train_time=1.490 +[gpub001:0/128] 2023-07-02 09:07:27,650 (trainer:732) INFO: 3epoch:train:3101-3200batch: iter_time=9.178e-05, forward_time=0.145, loss_ctc=92.726, loss_att=90.434, acc=0.567, loss=91.122, backward_time=1.083, grad_norm=123.287, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.367e-04, train_time=1.502 +[gpub001:0/128] 2023-07-02 09:07:37,004 (multiple_iter_factory:32) INFO: Building 8th iter-factory... +[gpub001:0/128] 2023-07-02 09:07:59,537 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 09:08:03,842 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.6", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.6", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.6", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.6", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 09:08:03,842 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.6, +[gpub001:0/128] 2023-07-02 09:08:03,846 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 09:15:19,332 (trainer:732) INFO: 3epoch:train:3201-3300batch: iter_time=1.643, forward_time=0.155, loss_ctc=86.892, loss_att=74.836, acc=0.582, loss=78.453, backward_time=1.104, grad_norm=119.753, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.357e-04, train_time=4.717 +[gpub001:0/128] 2023-07-02 09:17:49,867 (trainer:732) INFO: 3epoch:train:3301-3400batch: iter_time=9.086e-05, forward_time=0.147, loss_ctc=94.866, loss_att=80.655, acc=0.580, loss=84.918, backward_time=1.083, grad_norm=133.801, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.346e-04, train_time=1.505 +[gpub001:0/128] 2023-07-02 09:20:19,435 (trainer:732) INFO: 3epoch:train:3401-3500batch: iter_time=9.423e-05, forward_time=0.145, loss_ctc=91.040, loss_att=90.233, acc=0.562, loss=90.475, backward_time=1.080, grad_norm=122.931, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.336e-04, train_time=1.495 +[gpub001:0/128] 2023-07-02 09:22:47,882 (trainer:732) INFO: 3epoch:train:3501-3600batch: iter_time=9.105e-05, forward_time=0.145, loss_ctc=90.252, loss_att=86.651, acc=0.579, loss=87.731, backward_time=1.080, grad_norm=128.843, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.326e-04, train_time=1.484 +[gpub001:0/128] 2023-07-02 09:22:49,908 (multiple_iter_factory:32) INFO: Building 9th iter-factory... +[gpub001:0/128] 2023-07-02 09:23:12,184 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 09:23:16,491 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.1", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 09:23:16,491 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.1, +[gpub001:0/128] 2023-07-02 09:23:16,495 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 09:29:14,252 (trainer:732) INFO: 3epoch:train:3601-3700batch: iter_time=2.007, forward_time=0.173, loss_ctc=86.671, loss_att=74.898, acc=0.593, loss=78.429, backward_time=1.111, grad_norm=109.421, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.122, optim0_lr0=2.316e-04, train_time=3.863 +[gpub001:0/128] 2023-07-02 09:31:55,295 (trainer:732) INFO: 3epoch:train:3701-3800batch: iter_time=1.082e-04, forward_time=0.146, loss_ctc=95.586, loss_att=81.639, acc=0.588, loss=85.823, backward_time=1.093, grad_norm=144.015, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.306e-04, train_time=1.610 +[gpub001:0/128] 2023-07-02 09:34:33,919 (trainer:732) INFO: 3epoch:train:3801-3900batch: iter_time=8.169e-05, forward_time=0.145, loss_ctc=88.969, loss_att=87.806, acc=0.582, loss=88.155, backward_time=1.090, grad_norm=123.158, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.296e-04, train_time=1.586 +[gpub001:0/128] 2023-07-02 09:37:16,204 (trainer:732) INFO: 3epoch:train:3901-4000batch: iter_time=7.505e-05, forward_time=0.146, loss_ctc=90.182, loss_att=85.377, acc=0.604, loss=86.819, backward_time=1.099, grad_norm=111.405, clip=100.000, loss_scale=2.097e+06, optim_step_time=0.121, optim0_lr0=2.287e-04, train_time=1.623 +[gpub001:0/128] 2023-07-02 09:46:42,576 (trainer:338) INFO: 3epoch results: [train] iter_time=0.523, forward_time=0.153, loss_ctc=96.036, loss_att=91.357, acc=0.554, loss=92.761, backward_time=1.093, grad_norm=150.980, clip=100.000, loss_scale=1.573e+06, optim_step_time=0.121, optim0_lr0=2.318e-04, train_time=2.267, time=2 hours, 31 minutes and 22.78 seconds, total_count=12000, gpu_max_cached_mem_GB=37.209, [valid] loss_ctc=94.999, cer_ctc=0.408, loss_att=80.741, acc=0.476, cer=0.524, wer=1.000, loss=85.019, time=3 minutes and 26.69 seconds, total_count=1518, gpu_max_cached_mem_GB=37.209, [att_plot] time=5 minutes and 47.46 seconds, total_count=0, gpu_max_cached_mem_GB=37.209 +[gpub001:0/128] 2023-07-02 09:46:57,926 (trainer:386) INFO: The best model has been updated: valid.acc, valid.total_count +[gpub001:0/128] 2023-07-02 09:46:57,928 (trainer:272) INFO: 4/100epoch started. Estimated time to finish: 1 week, 3 days and 22 hours +[gpub001:0/128] 2023-07-02 09:46:57,931 (multiple_iter_factory:32) INFO: Building 0th iter-factory... +[gpub001:0/128] 2023-07-02 09:47:20,132 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 09:47:24,739 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.1", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 09:47:24,739 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.1, +[gpub001:0/128] 2023-07-02 09:47:24,743 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 09:52:16,542 (trainer:732) INFO: 4epoch:train:1-100batch: iter_time=1.603, forward_time=0.147, loss_ctc=98.611, loss_att=79.913, acc=0.566, loss=85.523, backward_time=1.107, grad_norm=126.977, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.277e-04, train_time=3.186 +[gpub001:0/128] 2023-07-02 09:54:48,383 (trainer:732) INFO: 4epoch:train:101-200batch: iter_time=9.905e-05, forward_time=0.146, loss_ctc=93.292, loss_att=81.088, acc=0.588, loss=84.749, backward_time=1.087, grad_norm=140.335, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.268e-04, train_time=1.518 +[gpub001:0/128] 2023-07-02 09:57:28,030 (trainer:732) INFO: 4epoch:train:201-300batch: iter_time=1.015e-04, forward_time=0.144, loss_ctc=87.246, loss_att=71.450, acc=0.583, loss=76.189, backward_time=1.089, grad_norm=114.444, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.259e-04, train_time=1.596 +[gpub001:0/128] 2023-07-02 10:00:28,551 (trainer:732) INFO: 4epoch:train:301-400batch: iter_time=1.028e-04, forward_time=0.265, loss_ctc=98.629, loss_att=87.747, acc=0.596, loss=91.012, backward_time=1.121, grad_norm=129.662, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.124, optim0_lr0=2.249e-04, train_time=1.805 +[gpub001:0/128] 2023-07-02 10:00:31,903 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpub001:0/128] 2023-07-02 10:00:53,880 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 10:00:58,136 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.2", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.2", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.2", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.2", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 10:00:58,136 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.2, +[gpub001:0/128] 2023-07-02 10:00:58,163 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 10:07:30,385 (trainer:732) INFO: 4epoch:train:401-500batch: iter_time=1.924, forward_time=0.160, loss_ctc=93.934, loss_att=78.095, acc=0.565, loss=82.847, backward_time=1.116, grad_norm=124.073, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.122, optim0_lr0=2.240e-04, train_time=4.218 +[gpub001:0/128] 2023-07-02 10:10:00,005 (trainer:732) INFO: 4epoch:train:501-600batch: iter_time=9.793e-05, forward_time=0.145, loss_ctc=95.139, loss_att=82.950, acc=0.579, loss=86.606, backward_time=1.084, grad_norm=150.418, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.231e-04, train_time=1.496 +[gpub001:0/128] 2023-07-02 10:12:28,946 (trainer:732) INFO: 4epoch:train:601-700batch: iter_time=9.726e-05, forward_time=0.145, loss_ctc=81.329, loss_att=68.312, acc=0.587, loss=72.217, backward_time=1.082, grad_norm=104.574, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.223e-04, train_time=1.489 +[gpub001:0/128] 2023-07-02 10:14:57,804 (trainer:732) INFO: 4epoch:train:701-800batch: iter_time=9.243e-05, forward_time=0.145, loss_ctc=98.008, loss_att=90.442, acc=0.585, loss=92.712, backward_time=1.081, grad_norm=138.562, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.214e-04, train_time=1.488 +[gpub001:0/128] 2023-07-02 10:15:11,245 (multiple_iter_factory:32) INFO: Building 2th iter-factory... +[gpub001:0/128] 2023-07-02 10:15:34,137 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 10:15:38,468 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.4", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.4", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.4", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.4", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 10:15:38,468 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.4, +[gpub001:0/128] 2023-07-02 10:15:38,472 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 10:20:48,872 (trainer:732) INFO: 4epoch:train:801-900batch: iter_time=1.797, forward_time=0.161, loss_ctc=94.104, loss_att=75.650, acc=0.575, loss=81.186, backward_time=1.104, grad_norm=142.424, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.205e-04, train_time=3.510 +[gpub001:0/128] 2023-07-02 10:23:20,613 (trainer:732) INFO: 4epoch:train:901-1000batch: iter_time=9.979e-05, forward_time=0.145, loss_ctc=91.794, loss_att=79.748, acc=0.589, loss=83.362, backward_time=1.095, grad_norm=112.707, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.197e-04, train_time=1.517 +[gpub001:0/128] 2023-07-02 10:25:49,266 (trainer:732) INFO: 4epoch:train:1001-1100batch: iter_time=1.043e-04, forward_time=0.145, loss_ctc=80.587, loss_att=68.929, acc=0.594, loss=72.427, backward_time=1.079, grad_norm=117.919, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.188e-04, train_time=1.486 +[gpub001:0/128] 2023-07-02 10:28:17,736 (trainer:732) INFO: 4epoch:train:1101-1200batch: iter_time=9.663e-05, forward_time=0.145, loss_ctc=96.761, loss_att=88.378, acc=0.589, loss=90.893, backward_time=1.081, grad_norm=118.353, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.180e-04, train_time=1.484 +[gpub001:0/128] 2023-07-02 10:28:19,612 (multiple_iter_factory:32) INFO: Building 3th iter-factory... +[gpub001:0/128] 2023-07-02 10:28:41,496 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 10:28:45,961 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.7", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.7", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.7", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.7", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 10:28:45,961 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.7, +[gpub001:0/128] 2023-07-02 10:28:45,965 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 10:34:43,152 (trainer:732) INFO: 4epoch:train:1201-1300batch: iter_time=1.606, forward_time=0.181, loss_ctc=91.554, loss_att=74.234, acc=0.591, loss=79.430, backward_time=1.106, grad_norm=113.931, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.172e-04, train_time=3.854 +[gpub001:0/128] 2023-07-02 10:37:13,480 (trainer:732) INFO: 4epoch:train:1301-1400batch: iter_time=7.940e-05, forward_time=0.146, loss_ctc=89.134, loss_att=77.040, acc=0.606, loss=80.668, backward_time=1.084, grad_norm=106.282, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.164e-04, train_time=1.503 +[gpub001:0/128] 2023-07-02 10:39:44,808 (trainer:732) INFO: 4epoch:train:1401-1500batch: iter_time=8.522e-05, forward_time=0.146, loss_ctc=78.916, loss_att=63.992, acc=0.613, loss=68.469, backward_time=1.088, grad_norm=96.651, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.156e-04, train_time=1.513 +[gpub001:0/128] 2023-07-02 10:42:13,272 (trainer:732) INFO: 4epoch:train:1501-1600batch: iter_time=7.945e-05, forward_time=0.145, loss_ctc=96.481, loss_att=83.388, acc=0.614, loss=87.316, backward_time=1.081, grad_norm=147.415, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.148e-04, train_time=1.484 +[gpub001:0/128] 2023-07-02 10:42:23,348 (multiple_iter_factory:32) INFO: Building 4th iter-factory... +[gpub001:0/128] 2023-07-02 10:42:45,873 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 10:42:50,180 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.6", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.6", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.6", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.6", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 10:42:50,180 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.6, +[gpub001:0/128] 2023-07-02 10:42:50,184 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 10:49:13,034 (trainer:732) INFO: 4epoch:train:1601-1700batch: iter_time=1.974, forward_time=0.173, loss_ctc=88.077, loss_att=72.109, acc=0.591, loss=76.900, backward_time=1.098, grad_norm=115.526, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.140e-04, train_time=4.197 +[gpub001:0/128] 2023-07-02 10:51:42,877 (trainer:732) INFO: 4epoch:train:1701-1800batch: iter_time=7.686e-05, forward_time=0.145, loss_ctc=88.207, loss_att=74.904, acc=0.604, loss=78.895, backward_time=1.083, grad_norm=104.521, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.132e-04, train_time=1.498 +[gpub001:0/128] 2023-07-02 10:54:14,248 (trainer:732) INFO: 4epoch:train:1801-1900batch: iter_time=7.721e-05, forward_time=0.145, loss_ctc=77.966, loss_att=64.135, acc=0.608, loss=68.284, backward_time=1.084, grad_norm=108.621, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.124e-04, train_time=1.513 +[gpub001:0/128] 2023-07-02 10:56:42,821 (trainer:732) INFO: 4epoch:train:1901-2000batch: iter_time=7.336e-05, forward_time=0.144, loss_ctc=93.915, loss_att=85.705, acc=0.603, loss=88.168, backward_time=1.082, grad_norm=116.908, clip=100.000, loss_scale=4.194e+06, optim_step_time=0.121, optim0_lr0=2.117e-04, train_time=1.486 +[gpub001:0/128] 2023-07-02 10:56:44,596 (multiple_iter_factory:32) INFO: Building 5th iter-factory... +[gpub001:0/128] 2023-07-02 10:57:06,859 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 10:57:11,122 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.0", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.0", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.0", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.0", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 10:57:11,123 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.0, +[gpub001:0/128] 2023-07-02 10:57:11,126 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 11:04:53,694 (trainer:732) INFO: 4epoch:train:2001-2100batch: iter_time=1.699, forward_time=0.148, loss_ctc=88.188, loss_att=72.107, acc=0.591, loss=76.931, backward_time=1.102, grad_norm=116.420, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.109e-04, train_time=4.909 +[gpub001:0/128] 2023-07-02 11:07:24,536 (trainer:732) INFO: 4epoch:train:2101-2200batch: iter_time=1.028e-04, forward_time=0.148, loss_ctc=87.151, loss_att=74.177, acc=0.610, loss=78.069, backward_time=1.083, grad_norm=105.729, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.102e-04, train_time=1.508 +[gpub001:0/128] 2023-07-02 11:09:53,133 (trainer:732) INFO: 4epoch:train:2201-2300batch: iter_time=9.744e-05, forward_time=0.147, loss_ctc=79.577, loss_att=65.200, acc=0.609, loss=69.513, backward_time=1.080, grad_norm=100.328, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.094e-04, train_time=1.486 +[gpub001:0/128] 2023-07-02 11:12:23,324 (trainer:732) INFO: 4epoch:train:2301-2400batch: iter_time=9.173e-05, forward_time=0.147, loss_ctc=92.968, loss_att=84.240, acc=0.606, loss=86.858, backward_time=1.081, grad_norm=109.427, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.087e-04, train_time=1.502 +[gpub001:0/128] 2023-07-02 11:12:25,037 (multiple_iter_factory:32) INFO: Building 6th iter-factory... +[gpub001:0/128] 2023-07-02 11:12:47,470 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 11:12:51,797 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.8", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.8", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.8", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.8", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 11:12:51,797 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.8, +[gpub001:0/128] 2023-07-02 11:12:51,801 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 11:19:12,113 (trainer:732) INFO: 4epoch:train:2401-2500batch: iter_time=1.546, forward_time=0.177, loss_ctc=87.759, loss_att=70.861, acc=0.599, loss=75.930, backward_time=1.115, grad_norm=136.550, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.122, optim0_lr0=2.080e-04, train_time=4.087 +[gpub001:0/128] 2023-07-02 11:21:41,041 (trainer:732) INFO: 4epoch:train:2501-2600batch: iter_time=7.723e-05, forward_time=0.145, loss_ctc=86.882, loss_att=73.524, acc=0.613, loss=77.532, backward_time=1.082, grad_norm=106.358, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.072e-04, train_time=1.489 +[gpub001:0/128] 2023-07-02 11:24:18,226 (trainer:732) INFO: 4epoch:train:2601-2700batch: iter_time=8.031e-05, forward_time=0.144, loss_ctc=78.351, loss_att=64.279, acc=0.611, loss=68.501, backward_time=1.102, grad_norm=109.443, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.065e-04, train_time=1.572 +[gpub001:0/128] 2023-07-02 11:27:00,239 (trainer:732) INFO: 4epoch:train:2701-2800batch: iter_time=8.117e-05, forward_time=0.144, loss_ctc=93.410, loss_att=83.593, acc=0.610, loss=86.538, backward_time=1.102, grad_norm=114.063, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.058e-04, train_time=1.620 +[gpub001:0/128] 2023-07-02 11:27:05,797 (multiple_iter_factory:32) INFO: Building 7th iter-factory... +[gpub001:0/128] 2023-07-02 11:27:27,858 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 11:27:32,128 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.5", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 11:27:32,128 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.5, +[gpub001:0/128] 2023-07-02 11:27:32,132 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 11:34:05,899 (trainer:732) INFO: 4epoch:train:2801-2900batch: iter_time=2.044, forward_time=0.173, loss_ctc=87.698, loss_att=72.670, acc=0.606, loss=77.179, backward_time=1.104, grad_norm=115.150, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.051e-04, train_time=4.256 +[gpub001:0/128] 2023-07-02 11:36:36,429 (trainer:732) INFO: 4epoch:train:2901-3000batch: iter_time=9.052e-05, forward_time=0.147, loss_ctc=86.042, loss_att=73.284, acc=0.624, loss=77.111, backward_time=1.087, grad_norm=107.460, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.045e-04, train_time=1.505 +[gpub001:0/128] 2023-07-02 11:39:05,178 (trainer:732) INFO: 4epoch:train:3001-3100batch: iter_time=7.764e-05, forward_time=0.145, loss_ctc=76.124, loss_att=61.174, acc=0.628, loss=65.659, backward_time=1.081, grad_norm=99.575, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.038e-04, train_time=1.487 +[gpub001:0/128] 2023-07-02 11:41:33,800 (trainer:732) INFO: 4epoch:train:3101-3200batch: iter_time=7.370e-05, forward_time=0.144, loss_ctc=90.784, loss_att=79.546, acc=0.630, loss=82.917, backward_time=1.082, grad_norm=105.692, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.031e-04, train_time=1.486 +[gpub001:0/128] 2023-07-02 11:41:41,143 (multiple_iter_factory:32) INFO: Building 8th iter-factory... +[gpub001:0/128] 2023-07-02 11:42:03,545 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 11:42:07,889 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.3", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.3", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.3", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.3", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 11:42:07,889 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.3, +[gpub001:0/128] 2023-07-02 11:42:07,893 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 11:48:55,570 (trainer:732) INFO: 4epoch:train:3201-3300batch: iter_time=2.460, forward_time=0.170, loss_ctc=86.826, loss_att=70.191, acc=0.612, loss=75.182, backward_time=1.105, grad_norm=122.938, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.024e-04, train_time=4.417 +[gpub001:0/128] 2023-07-02 11:51:25,881 (trainer:732) INFO: 4epoch:train:3301-3400batch: iter_time=9.428e-05, forward_time=0.147, loss_ctc=85.421, loss_att=72.776, acc=0.625, loss=76.569, backward_time=1.086, grad_norm=116.810, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.018e-04, train_time=1.503 +[gpub001:0/128] 2023-07-02 11:53:54,354 (trainer:732) INFO: 4epoch:train:3401-3500batch: iter_time=8.940e-05, forward_time=0.146, loss_ctc=76.243, loss_att=61.089, acc=0.628, loss=65.635, backward_time=1.079, grad_norm=102.428, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.011e-04, train_time=1.485 +[gpub001:0/128] 2023-07-02 11:56:51,221 (trainer:732) INFO: 4epoch:train:3501-3600batch: iter_time=8.309e-05, forward_time=0.146, loss_ctc=90.417, loss_att=79.209, acc=0.635, loss=82.572, backward_time=1.115, grad_norm=107.842, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=2.005e-04, train_time=1.768 +[gpub001:0/128] 2023-07-02 11:56:54,712 (multiple_iter_factory:32) INFO: Building 9th iter-factory... +[gpub001:0/128] 2023-07-02 11:57:16,883 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 11:57:21,231 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.9", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 11:57:21,231 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.9, +[gpub001:0/128] 2023-07-02 11:57:21,235 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 12:04:28,387 (trainer:732) INFO: 4epoch:train:3601-3700batch: iter_time=1.574, forward_time=0.203, loss_ctc=87.050, loss_att=69.244, acc=0.619, loss=74.586, backward_time=1.115, grad_norm=102.912, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.123, optim0_lr0=1.998e-04, train_time=4.571 +[gpub001:0/128] 2023-07-02 12:06:58,225 (trainer:732) INFO: 4epoch:train:3701-3800batch: iter_time=9.426e-05, forward_time=0.147, loss_ctc=85.563, loss_att=72.811, acc=0.627, loss=76.637, backward_time=1.083, grad_norm=108.872, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=1.992e-04, train_time=1.499 +[gpub001:0/128] 2023-07-02 12:09:29,231 (trainer:732) INFO: 4epoch:train:3801-3900batch: iter_time=9.519e-05, forward_time=0.146, loss_ctc=76.919, loss_att=60.499, acc=0.633, loss=65.425, backward_time=1.079, grad_norm=100.011, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=1.986e-04, train_time=1.510 +[gpub001:0/128] 2023-07-02 12:11:59,799 (trainer:732) INFO: 4epoch:train:3901-4000batch: iter_time=8.595e-05, forward_time=0.146, loss_ctc=90.077, loss_att=77.617, acc=0.639, loss=81.355, backward_time=1.084, grad_norm=116.280, clip=100.000, loss_scale=8.389e+06, optim_step_time=0.121, optim0_lr0=1.979e-04, train_time=1.505 +[gpub001:0/128] 2023-07-02 12:21:57,381 (trainer:338) INFO: 4epoch results: [train] iter_time=0.456, forward_time=0.155, loss_ctc=88.178, loss_att=74.657, acc=0.605, loss=78.714, backward_time=1.092, grad_norm=115.865, clip=100.000, loss_scale=6.291e+06, optim_step_time=0.121, optim0_lr0=2.118e-04, train_time=2.175, time=2 hours, 25 minutes and 14.26 seconds, total_count=16000, gpu_max_cached_mem_GB=37.211, [valid] loss_ctc=85.064, cer_ctc=0.401, loss_att=69.007, acc=0.517, cer=0.516, wer=1.000, loss=73.824, time=3 minutes and 47.38 seconds, total_count=2024, gpu_max_cached_mem_GB=37.211, [att_plot] time=5 minutes and 57.81 seconds, total_count=0, gpu_max_cached_mem_GB=37.211 +[gpub001:0/128] 2023-07-02 12:22:12,814 (trainer:386) INFO: The best model has been updated: valid.acc, valid.total_count +[gpub001:0/128] 2023-07-02 12:22:12,816 (trainer:272) INFO: 5/100epoch started. Estimated time to finish: 1 week, 3 days and 16 hours +[gpub001:0/128] 2023-07-02 12:22:12,819 (multiple_iter_factory:32) INFO: Building 0th iter-factory... +[gpub001:0/128] 2023-07-02 12:22:35,284 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 12:22:41,131 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.9", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 12:22:41,131 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.9, +[gpub001:0/128] 2023-07-02 12:22:42,017 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 12:31:54,897 (trainer:732) INFO: 5epoch:train:1-100batch: iter_time=4.188, forward_time=0.167, loss_ctc=95.831, loss_att=83.013, acc=0.566, loss=86.858, backward_time=1.104, grad_norm=128.232, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.973e-04, train_time=5.821 +[gpub001:0/128] 2023-07-02 12:34:31,184 (trainer:732) INFO: 5epoch:train:101-200batch: iter_time=7.943e-05, forward_time=0.145, loss_ctc=86.928, loss_att=65.908, acc=0.615, loss=72.214, backward_time=1.097, grad_norm=107.199, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.967e-04, train_time=1.563 +[gpub001:0/128] 2023-07-02 12:37:14,909 (trainer:732) INFO: 5epoch:train:201-300batch: iter_time=8.068e-05, forward_time=0.145, loss_ctc=82.643, loss_att=66.057, acc=0.622, loss=71.033, backward_time=1.123, grad_norm=117.355, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.961e-04, train_time=1.637 +[gpub001:0/128] 2023-07-02 12:39:43,827 (trainer:732) INFO: 5epoch:train:301-400batch: iter_time=8.139e-05, forward_time=0.146, loss_ctc=95.498, loss_att=80.022, acc=0.595, loss=84.665, backward_time=1.081, grad_norm=144.339, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.955e-04, train_time=1.489 +[gpub001:0/128] 2023-07-02 12:39:51,380 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpub001:0/128] 2023-07-02 12:40:12,992 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 12:40:17,133 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.1", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 12:40:17,133 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.1, +[gpub001:0/128] 2023-07-02 12:40:17,137 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 12:48:12,918 (trainer:732) INFO: 5epoch:train:401-500batch: iter_time=1.579, forward_time=0.147, loss_ctc=88.461, loss_att=76.736, acc=0.586, loss=80.254, backward_time=1.098, grad_norm=113.460, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.949e-04, train_time=5.091 +[gpub001:0/128] 2023-07-02 12:50:45,684 (trainer:732) INFO: 5epoch:train:501-600batch: iter_time=1.028e-04, forward_time=0.147, loss_ctc=85.023, loss_att=63.231, acc=0.628, loss=69.769, backward_time=1.084, grad_norm=107.817, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.943e-04, train_time=1.527 +[gpub001:0/128] 2023-07-02 12:53:13,961 (trainer:732) INFO: 5epoch:train:601-700batch: iter_time=1.013e-04, forward_time=0.144, loss_ctc=80.852, loss_att=65.135, acc=0.627, loss=69.850, backward_time=1.076, grad_norm=121.490, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.937e-04, train_time=1.483 +[gpub001:0/128] 2023-07-02 12:55:42,736 (trainer:732) INFO: 5epoch:train:701-800batch: iter_time=9.932e-05, forward_time=0.145, loss_ctc=94.110, loss_att=78.547, acc=0.604, loss=83.216, backward_time=1.079, grad_norm=120.640, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.932e-04, train_time=1.488 +[gpub001:0/128] 2023-07-02 12:55:44,316 (multiple_iter_factory:32) INFO: Building 2th iter-factory... +[gpub001:0/128] 2023-07-02 12:56:07,456 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 12:56:11,744 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.0", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.0", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.0", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.0", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 12:56:11,744 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.0, +[gpub001:0/128] 2023-07-02 12:56:11,748 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 13:02:22,172 (trainer:732) INFO: 5epoch:train:801-900batch: iter_time=1.509, forward_time=0.148, loss_ctc=88.112, loss_att=77.763, acc=0.577, loss=80.868, backward_time=1.137, grad_norm=106.985, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.926e-04, train_time=3.994 +[gpub001:0/128] 2023-07-02 13:04:50,899 (trainer:732) INFO: 5epoch:train:901-1000batch: iter_time=1.198e-04, forward_time=0.147, loss_ctc=83.503, loss_att=64.856, acc=0.623, loss=70.450, backward_time=1.080, grad_norm=105.383, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.920e-04, train_time=1.487 +[gpub001:0/128] 2023-07-02 13:07:19,780 (trainer:732) INFO: 5epoch:train:1001-1100batch: iter_time=1.212e-04, forward_time=0.148, loss_ctc=78.804, loss_att=62.053, acc=0.633, loss=67.078, backward_time=1.083, grad_norm=99.605, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.915e-04, train_time=1.489 +[gpub001:0/128] 2023-07-02 13:09:48,658 (trainer:732) INFO: 5epoch:train:1101-1200batch: iter_time=1.177e-04, forward_time=0.147, loss_ctc=91.446, loss_att=76.278, acc=0.600, loss=80.828, backward_time=1.080, grad_norm=117.116, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.909e-04, train_time=1.489 +[gpub001:0/128] 2023-07-02 13:09:50,669 (multiple_iter_factory:32) INFO: Building 3th iter-factory... +[gpub001:0/128] 2023-07-02 13:10:12,649 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 13:10:17,106 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.4", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.4", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.4", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.4", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 13:10:17,106 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.4, +[gpub001:0/128] 2023-07-02 13:10:17,110 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 13:16:06,569 (trainer:732) INFO: 5epoch:train:1201-1300batch: iter_time=1.607, forward_time=0.175, loss_ctc=85.120, loss_att=72.798, acc=0.593, loss=76.495, backward_time=1.101, grad_norm=103.506, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.122, optim0_lr0=1.903e-04, train_time=3.779 +[gpub001:0/128] 2023-07-02 13:18:36,586 (trainer:732) INFO: 5epoch:train:1301-1400batch: iter_time=1.060e-04, forward_time=0.145, loss_ctc=84.666, loss_att=64.196, acc=0.625, loss=70.337, backward_time=1.081, grad_norm=125.999, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.898e-04, train_time=1.500 +[gpub001:0/128] 2023-07-02 13:21:05,506 (trainer:732) INFO: 5epoch:train:1401-1500batch: iter_time=1.041e-04, forward_time=0.148, loss_ctc=76.989, loss_att=60.444, acc=0.639, loss=65.407, backward_time=1.082, grad_norm=93.583, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.892e-04, train_time=1.489 +[gpub001:0/128] 2023-07-02 13:23:34,157 (trainer:732) INFO: 5epoch:train:1501-1600batch: iter_time=8.877e-05, forward_time=0.147, loss_ctc=91.522, loss_att=76.267, acc=0.603, loss=80.844, backward_time=1.081, grad_norm=103.188, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.887e-04, train_time=1.486 +[gpub001:0/128] 2023-07-02 13:23:54,184 (multiple_iter_factory:32) INFO: Building 4th iter-factory... +[gpub001:0/128] 2023-07-02 13:24:16,462 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 13:24:20,736 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.5", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 13:24:20,736 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.5, +[gpub001:0/128] 2023-07-02 13:24:20,752 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 13:30:23,399 (trainer:732) INFO: 5epoch:train:1601-1700batch: iter_time=2.283, forward_time=0.146, loss_ctc=86.174, loss_att=73.323, acc=0.604, loss=77.178, backward_time=1.124, grad_norm=110.109, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.882e-04, train_time=4.092 +[gpub001:0/128] 2023-07-02 13:32:53,039 (trainer:732) INFO: 5epoch:train:1701-1800batch: iter_time=1.105e-04, forward_time=0.146, loss_ctc=81.258, loss_att=60.193, acc=0.643, loss=66.513, backward_time=1.081, grad_norm=124.281, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.876e-04, train_time=1.496 +[gpub001:0/128] 2023-07-02 13:35:25,312 (trainer:732) INFO: 5epoch:train:1801-1900batch: iter_time=1.034e-04, forward_time=0.147, loss_ctc=76.781, loss_att=61.004, acc=0.646, loss=65.737, backward_time=1.083, grad_norm=106.544, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.871e-04, train_time=1.523 +[gpub001:0/128] 2023-07-02 13:38:08,039 (trainer:732) INFO: 5epoch:train:1901-2000batch: iter_time=1.047e-04, forward_time=0.147, loss_ctc=89.325, loss_att=74.875, acc=0.619, loss=79.210, backward_time=1.121, grad_norm=109.861, clip=100.000, loss_scale=1.678e+07, optim_step_time=0.121, optim0_lr0=1.866e-04, train_time=1.627 +[gpub001:0/128] 2023-07-02 13:38:12,719 (multiple_iter_factory:32) INFO: Building 5th iter-factory... +[gpub001:0/128] 2023-07-02 13:38:34,773 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 13:38:39,038 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.7", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.7", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.7", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.7", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 13:38:39,038 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.7, +[gpub001:0/128] 2023-07-02 13:38:39,042 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 13:45:41,717 (trainer:732) INFO: 5epoch:train:2001-2100batch: iter_time=1.533, forward_time=0.146, loss_ctc=86.087, loss_att=72.796, acc=0.601, loss=76.784, backward_time=1.099, grad_norm=114.369, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.861e-04, train_time=4.537 +[gpub001:0/128] 2023-07-02 13:48:14,220 (trainer:732) INFO: 5epoch:train:2101-2200batch: iter_time=1.082e-04, forward_time=0.145, loss_ctc=81.831, loss_att=60.613, acc=0.644, loss=66.978, backward_time=1.088, grad_norm=131.370, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.856e-04, train_time=1.525 +[gpub001:0/128] 2023-07-02 13:50:51,032 (trainer:732) INFO: 5epoch:train:2201-2300batch: iter_time=1.024e-04, forward_time=0.146, loss_ctc=77.795, loss_att=60.560, acc=0.646, loss=65.730, backward_time=1.089, grad_norm=113.810, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.851e-04, train_time=1.568 +[gpub001:0/128] 2023-07-02 13:53:32,141 (trainer:732) INFO: 5epoch:train:2301-2400batch: iter_time=1.003e-04, forward_time=0.147, loss_ctc=89.325, loss_att=72.509, acc=0.624, loss=77.553, backward_time=1.096, grad_norm=104.473, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.845e-04, train_time=1.611 +[gpub001:0/128] 2023-07-02 13:53:33,805 (multiple_iter_factory:32) INFO: Building 6th iter-factory... +[gpub001:0/128] 2023-07-02 13:53:56,495 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 13:54:00,777 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.8", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.8", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.8", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.8", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 13:54:00,778 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.8, +[gpub001:0/128] 2023-07-02 13:54:00,781 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 14:00:22,334 (trainer:732) INFO: 5epoch:train:2401-2500batch: iter_time=1.553, forward_time=0.186, loss_ctc=83.017, loss_att=72.643, acc=0.601, loss=75.755, backward_time=1.101, grad_norm=101.899, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.122, optim0_lr0=1.840e-04, train_time=4.102 +[gpub001:0/128] 2023-07-02 14:02:53,893 (trainer:732) INFO: 5epoch:train:2501-2600batch: iter_time=1.039e-04, forward_time=0.143, loss_ctc=82.586, loss_att=62.473, acc=0.636, loss=68.507, backward_time=1.082, grad_norm=109.136, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.835e-04, train_time=1.515 +[gpub001:0/128] 2023-07-02 14:05:22,448 (trainer:732) INFO: 5epoch:train:2601-2700batch: iter_time=1.084e-04, forward_time=0.144, loss_ctc=78.078, loss_att=60.349, acc=0.643, loss=65.668, backward_time=1.077, grad_norm=104.700, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.831e-04, train_time=1.485 +[gpub001:0/128] 2023-07-02 14:08:06,353 (trainer:732) INFO: 5epoch:train:2701-2800batch: iter_time=1.017e-04, forward_time=0.145, loss_ctc=88.355, loss_att=73.214, acc=0.612, loss=77.756, backward_time=1.092, grad_norm=114.943, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.826e-04, train_time=1.639 +[gpub001:0/128] 2023-07-02 14:08:14,829 (multiple_iter_factory:32) INFO: Building 7th iter-factory... +[gpub001:0/128] 2023-07-02 14:08:36,903 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 14:08:41,179 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.3", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.3", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.3", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.3", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 14:08:41,179 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.3, +[gpub001:0/128] 2023-07-02 14:08:41,183 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 14:16:05,822 (trainer:732) INFO: 5epoch:train:2801-2900batch: iter_time=1.639, forward_time=0.145, loss_ctc=84.011, loss_att=71.273, acc=0.614, loss=75.095, backward_time=1.103, grad_norm=125.591, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.821e-04, train_time=4.794 +[gpub001:0/128] 2023-07-02 14:18:39,318 (trainer:732) INFO: 5epoch:train:2901-3000batch: iter_time=2.634e-04, forward_time=0.168, loss_ctc=81.248, loss_att=59.851, acc=0.647, loss=66.270, backward_time=1.091, grad_norm=91.920, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.124, optim0_lr0=1.816e-04, train_time=1.535 +[gpub001:0/128] 2023-07-02 14:21:11,194 (trainer:732) INFO: 5epoch:train:3001-3100batch: iter_time=1.143e-04, forward_time=0.162, loss_ctc=75.675, loss_att=60.444, acc=0.652, loss=65.013, backward_time=1.084, grad_norm=93.222, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.811e-04, train_time=1.518 +[gpub001:0/128] 2023-07-02 14:23:48,791 (trainer:732) INFO: 5epoch:train:3101-3200batch: iter_time=1.141e-04, forward_time=0.164, loss_ctc=90.248, loss_att=72.932, acc=0.623, loss=78.127, backward_time=1.095, grad_norm=113.248, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.122, optim0_lr0=1.807e-04, train_time=1.576 +[gpub001:0/128] 2023-07-02 14:23:59,201 (multiple_iter_factory:32) INFO: Building 8th iter-factory... +[gpub001:0/128] 2023-07-02 14:24:21,746 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 14:24:26,057 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.6", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.6", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.6", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.6", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 14:24:26,057 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.6, +[gpub001:0/128] 2023-07-02 14:24:26,061 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 14:31:50,659 (trainer:732) INFO: 5epoch:train:3201-3300batch: iter_time=1.865, forward_time=0.167, loss_ctc=83.228, loss_att=69.985, acc=0.610, loss=73.958, backward_time=1.102, grad_norm=113.113, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.122, optim0_lr0=1.802e-04, train_time=4.818 +[gpub001:0/128] 2023-07-02 14:34:19,355 (trainer:732) INFO: 5epoch:train:3301-3400batch: iter_time=9.341e-05, forward_time=0.144, loss_ctc=80.645, loss_att=61.484, acc=0.643, loss=67.232, backward_time=1.079, grad_norm=94.831, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.797e-04, train_time=1.487 +[gpub001:0/128] 2023-07-02 14:36:52,892 (trainer:732) INFO: 5epoch:train:3401-3500batch: iter_time=9.983e-05, forward_time=0.145, loss_ctc=74.661, loss_att=58.553, acc=0.648, loss=63.385, backward_time=1.091, grad_norm=112.096, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.793e-04, train_time=1.535 +[gpub001:0/128] 2023-07-02 14:39:27,050 (trainer:732) INFO: 5epoch:train:3501-3600batch: iter_time=8.702e-05, forward_time=0.147, loss_ctc=87.700, loss_att=72.286, acc=0.618, loss=76.910, backward_time=1.091, grad_norm=108.539, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.788e-04, train_time=1.541 +[gpub001:0/128] 2023-07-02 14:39:34,517 (multiple_iter_factory:32) INFO: Building 9th iter-factory... +[gpub001:0/128] 2023-07-02 14:39:57,079 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 14:40:01,384 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.2", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.2", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.2", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.2", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 14:40:01,385 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.2, +[gpub001:0/128] 2023-07-02 14:40:01,388 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 14:47:22,407 (trainer:732) INFO: 5epoch:train:3601-3700batch: iter_time=2.338, forward_time=0.146, loss_ctc=82.381, loss_att=70.294, acc=0.609, loss=73.920, backward_time=1.111, grad_norm=95.828, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.783e-04, train_time=4.753 +[gpub001:0/128] 2023-07-02 14:49:58,572 (trainer:732) INFO: 5epoch:train:3701-3800batch: iter_time=1.275e-04, forward_time=0.147, loss_ctc=78.720, loss_att=59.946, acc=0.648, loss=65.578, backward_time=1.090, grad_norm=91.671, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.779e-04, train_time=1.561 +[gpub001:0/128] 2023-07-02 14:52:32,393 (trainer:732) INFO: 5epoch:train:3801-3900batch: iter_time=1.171e-04, forward_time=0.144, loss_ctc=76.291, loss_att=59.368, acc=0.648, loss=64.445, backward_time=1.080, grad_norm=102.358, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.774e-04, train_time=1.538 +[gpub001:0/128] 2023-07-02 14:55:03,373 (trainer:732) INFO: 5epoch:train:3901-4000batch: iter_time=1.187e-04, forward_time=0.146, loss_ctc=86.476, loss_att=71.726, acc=0.622, loss=76.151, backward_time=1.081, grad_norm=105.900, clip=100.000, loss_scale=3.355e+07, optim_step_time=0.121, optim0_lr0=1.770e-04, train_time=1.510 +[gpub001:0/128] 2023-07-02 15:04:31,604 (trainer:338) INFO: 5epoch results: [train] iter_time=0.502, forward_time=0.150, loss_ctc=84.285, loss_att=68.150, acc=0.621, loss=72.991, backward_time=1.092, grad_norm=110.243, clip=100.000, loss_scale=2.517e+07, optim_step_time=0.121, optim0_lr0=1.866e-04, train_time=2.292, time=2 hours, 33 minutes and 0.88 seconds, total_count=20000, gpu_max_cached_mem_GB=37.211, [valid] loss_ctc=72.987, cer_ctc=0.380, loss_att=58.253, acc=0.554, cer=0.483, wer=0.990, loss=62.673, time=3 minutes and 29.52 seconds, total_count=2530, gpu_max_cached_mem_GB=37.211, [att_plot] time=5 minutes and 48.38 seconds, total_count=0, gpu_max_cached_mem_GB=37.211 +[gpub001:0/128] 2023-07-02 15:04:47,134 (trainer:386) INFO: The best model has been updated: valid.acc, valid.total_count +[gpub001:0/128] 2023-07-02 15:04:47,137 (trainer:272) INFO: 6/100epoch started. Estimated time to finish: 1 week, 3 days and 14 hours +[gpub001:0/128] 2023-07-02 15:04:47,140 (multiple_iter_factory:32) INFO: Building 0th iter-factory... +[gpub001:0/128] 2023-07-02 15:05:08,659 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 15:05:12,886 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.9", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 15:05:12,886 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.9, +[gpub001:0/128] 2023-07-02 15:05:12,890 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 15:09:55,173 (trainer:732) INFO: 6epoch:train:1-100batch: iter_time=1.516, forward_time=0.173, loss_ctc=92.368, loss_att=76.121, acc=0.626, loss=80.995, backward_time=1.103, grad_norm=126.531, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.765e-04, train_time=3.080 +[gpub001:0/128] 2023-07-02 15:12:32,565 (trainer:732) INFO: 6epoch:train:101-200batch: iter_time=9.775e-05, forward_time=0.144, loss_ctc=83.143, loss_att=68.670, acc=0.613, loss=73.012, backward_time=1.090, grad_norm=97.187, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.761e-04, train_time=1.574 +[gpub001:0/128] 2023-07-02 15:15:24,315 (trainer:732) INFO: 6epoch:train:201-300batch: iter_time=1.016e-04, forward_time=0.145, loss_ctc=84.306, loss_att=66.722, acc=0.648, loss=71.997, backward_time=1.105, grad_norm=107.360, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.757e-04, train_time=1.717 +[gpub001:0/128] 2023-07-02 15:18:00,929 (trainer:732) INFO: 6epoch:train:301-400batch: iter_time=1.021e-04, forward_time=0.145, loss_ctc=91.913, loss_att=79.037, acc=0.636, loss=82.900, backward_time=1.090, grad_norm=110.113, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.752e-04, train_time=1.566 +[gpub001:0/128] 2023-07-02 15:18:17,251 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpub001:0/128] 2023-07-02 15:18:39,584 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 15:18:43,772 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.8", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.8", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.8", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.8", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 15:18:43,772 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.8, +[gpub001:0/128] 2023-07-02 15:18:43,861 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 15:25:56,982 (trainer:732) INFO: 6epoch:train:401-500batch: iter_time=2.079, forward_time=0.167, loss_ctc=92.693, loss_att=72.766, acc=0.624, loss=78.744, backward_time=1.096, grad_norm=118.287, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.748e-04, train_time=4.760 +[gpub001:0/128] 2023-07-02 15:28:27,102 (trainer:732) INFO: 6epoch:train:501-600batch: iter_time=1.140e-04, forward_time=0.145, loss_ctc=81.911, loss_att=68.694, acc=0.611, loss=72.659, backward_time=1.081, grad_norm=97.202, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.744e-04, train_time=1.501 +[gpub001:0/128] 2023-07-02 15:30:56,188 (trainer:732) INFO: 6epoch:train:601-700batch: iter_time=1.145e-04, forward_time=0.146, loss_ctc=83.558, loss_att=64.117, acc=0.651, loss=69.949, backward_time=1.080, grad_norm=106.948, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.740e-04, train_time=1.491 +[gpub001:0/128] 2023-07-02 15:33:25,358 (trainer:732) INFO: 6epoch:train:701-800batch: iter_time=1.124e-04, forward_time=0.145, loss_ctc=87.949, loss_att=76.437, acc=0.632, loss=79.891, backward_time=1.082, grad_norm=108.261, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.735e-04, train_time=1.491 +[gpub001:0/128] 2023-07-02 15:33:27,210 (multiple_iter_factory:32) INFO: Building 2th iter-factory... +[gpub001:0/128] 2023-07-02 15:33:49,343 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 15:33:53,609 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.1", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 15:33:53,609 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.1, +[gpub001:0/128] 2023-07-02 15:33:53,613 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 15:40:08,819 (trainer:732) INFO: 6epoch:train:801-900batch: iter_time=1.543, forward_time=0.145, loss_ctc=89.210, loss_att=70.814, acc=0.647, loss=76.333, backward_time=1.102, grad_norm=134.895, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.731e-04, train_time=4.034 +[gpub001:0/128] 2023-07-02 15:42:38,135 (trainer:732) INFO: 6epoch:train:901-1000batch: iter_time=1.010e-04, forward_time=0.145, loss_ctc=82.434, loss_att=67.557, acc=0.627, loss=72.020, backward_time=1.083, grad_norm=95.861, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.727e-04, train_time=1.493 +[gpub001:0/128] 2023-07-02 15:45:07,723 (trainer:732) INFO: 6epoch:train:1001-1100batch: iter_time=1.084e-04, forward_time=0.145, loss_ctc=81.188, loss_att=62.844, acc=0.667, loss=68.347, backward_time=1.083, grad_norm=93.331, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.723e-04, train_time=1.496 +[gpub001:0/128] 2023-07-02 15:47:38,402 (trainer:732) INFO: 6epoch:train:1101-1200batch: iter_time=1.003e-04, forward_time=0.147, loss_ctc=88.287, loss_att=74.520, acc=0.648, loss=78.650, backward_time=1.086, grad_norm=111.618, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.719e-04, train_time=1.507 +[gpub001:0/128] 2023-07-02 15:47:48,256 (multiple_iter_factory:32) INFO: Building 3th iter-factory... +[gpub001:0/128] 2023-07-02 15:48:09,974 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 15:48:14,373 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.7", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.7", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.7", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.7", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 15:48:14,373 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.7, +[gpub001:0/128] 2023-07-02 15:48:14,377 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 15:54:23,324 (trainer:732) INFO: 6epoch:train:1201-1300batch: iter_time=1.601, forward_time=0.159, loss_ctc=88.348, loss_att=73.412, acc=0.647, loss=77.893, backward_time=1.101, grad_norm=112.423, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.715e-04, train_time=4.049 +[gpub001:0/128] 2023-07-02 15:56:53,521 (trainer:732) INFO: 6epoch:train:1301-1400batch: iter_time=1.244e-04, forward_time=0.146, loss_ctc=79.386, loss_att=65.071, acc=0.634, loss=69.365, backward_time=1.082, grad_norm=105.505, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.711e-04, train_time=1.502 +[gpub001:0/128] 2023-07-02 15:59:22,392 (trainer:732) INFO: 6epoch:train:1401-1500batch: iter_time=1.308e-04, forward_time=0.148, loss_ctc=81.481, loss_att=62.703, acc=0.665, loss=68.336, backward_time=1.081, grad_norm=101.416, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.707e-04, train_time=1.488 +[gpub001:0/128] 2023-07-02 16:01:52,855 (trainer:732) INFO: 6epoch:train:1501-1600batch: iter_time=1.191e-04, forward_time=0.148, loss_ctc=87.263, loss_att=73.197, acc=0.653, loss=77.416, backward_time=1.084, grad_norm=107.970, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.703e-04, train_time=1.504 +[gpub001:0/128] 2023-07-02 16:01:58,402 (multiple_iter_factory:32) INFO: Building 4th iter-factory... +[gpub001:0/128] 2023-07-02 16:02:20,429 (s2t:454) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpub001:0/128] 2023-07-02 16:02:24,729 (abs_task:1570) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits10/text/split.5", "type": "text"} + preprocess: ) +[gpub001:0/128] 2023-07-02 16:02:24,729 (abs_task:1571) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=22796, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits10/speech_shape/split.5, +[gpub001:0/128] 2023-07-02 16:02:24,733 (abs_task:1572) INFO: [train] mini-batch sizes summary: N-batch=22796, mean=256.0, min=256, max=257 +[gpub001:0/128] 2023-07-02 16:07:49,090 (trainer:732) INFO: 6epoch:train:1601-1700batch: iter_time=1.671, forward_time=0.171, loss_ctc=90.466, loss_att=70.502, acc=0.651, loss=76.491, backward_time=1.106, grad_norm=124.483, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.123, optim0_lr0=1.699e-04, train_time=3.562 +[gpub001:0/128] 2023-07-02 16:10:20,476 (trainer:732) INFO: 6epoch:train:1701-1800batch: iter_time=9.879e-05, forward_time=0.146, loss_ctc=80.510, loss_att=64.788, acc=0.634, loss=69.505, backward_time=1.086, grad_norm=108.106, clip=100.000, loss_scale=6.711e+07, optim_step_time=0.121, optim0_lr0=1.695e-04, train_time=1.514 +/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/kaldiio/utils.py:481: UserWarning: An error happens at loading "dump/raw/org/GigaST/XL.en-de/data/format.49/data_wav.ark:1438393521" + warnings.warn('An error happens at loading "{}"'.format(ark_name)) +ERROR:root:Error happened with path=exp/s2t_stats_raw_bpe50000/splits10/wav.scp/split.5, type=kaldi_ark, id=GigaST_YOU0000008013_005722080_005750350_en_st_de +Process SpawnProcess-3: +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap + self.run() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1359, in main_worker + cls.trainer.run( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/reporter.py", line 267, in measure_iter_time + retval = next(iterator) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/iterators/multiple_iter_factory.py", line 35, in build_iter + yield from iter_factory.build_iter(epoch, shuffle) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 628, in __next__ + data = self._next_data() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1333, in _next_data + return self._process_data(data) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1359, in _process_data + data.reraise() + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/_utils.py", line 543, in reraise + raise exception +PermissionError: Caught PermissionError in DataLoader worker process 1. +Original Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop + data = fetcher.fetch(index) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 58, in + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/dataset.py", line 513, in __getitem__ + value = loader[uid] + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/dataset.py", line 52, in __getitem__ + retval = self.loader[key] + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/kaldiio/utils.py", line 479, in __getitem__ + return self._loader(ark_name) + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/kaldiio/matio.py", line 235, in load_mat + fd_dict[ark] = open_like_kaldi(ark, "rb") + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/kaldiio/utils.py", line 207, in open_like_kaldi + return io.open(name, mode, encoding=encoding) +PermissionError: [Errno 13] Permission denied: 'dump/raw/org/GigaST/XL.en-de/data/format.49/data_wav.ark' + +Traceback (most recent call last): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in + main() + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main + S2TTask.main(cmd=cmd) + File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1104, in main + while not ProcessContext(processes, error_queues).join(): + File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 +srun: error: gpub040: task 14: Exited with exit code 1 +slurmstepd: error: *** STEP 2115302.0 ON gpub001 CANCELLED AT 2023-07-02T16:14:40 ***