diff --git "a/logs/main/main_log.txt" "b/logs/main/main_log.txt" new file mode 100644--- /dev/null +++ "b/logs/main/main_log.txt" @@ -0,0 +1,1571 @@ +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:using world size: 8, data-parallel-size: 8, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +[default0]:WARNING: overriding default arguments for tokenizer_type:GPT2BPETokenizer with tokenizer_type:PretrainedFromHF +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 8 +[default0]: data_path ....................................... ['/gpfsscratch/rech/ajs/commun/datasets/c4/gpt2tok_c4_text_document'] +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.877170.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 1000 +[default0]: eval_iters ...................................... 100 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 1190 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 1024 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 512 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 256 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.1 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ None +[default0]: kv_channels ..................................... 32 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ None +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 10 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 0.001 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ 1953125 +[default0]: lr_decay_style .................................. cosine +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 183105 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 32 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: mup ............................................. True +[default0]: mup_attn_mult ................................... 10.0 +[default0]: mup_base_ffn_hidden_size ........................ 256 +[default0]: mup_base_hidden_size ............................ 64 +[default0]: mup_coord_check ................................. False +[default0]: mup_input_mult .................................. 10.0 +[default0]: mup_output_mult ................................. 10.0 +[default0]: no_load_optim ................................... None +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: num_attention_heads ............................. 8 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 32 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 51200 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 1 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default0]: save_interval ................................... 1000 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... 969, 30, 1 +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 1 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup-logs/tensorboard/main +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_names ....................... None +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: test_weighted_split_splits ...................... None +[default0]: test_weighted_split_weights ..................... None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... gpt2 +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 1953125 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_paths ...................... None +[default0]: train_weighted_split_paths_path ................. None +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... None +[default0]: valid_weighted_split_paths ...................... None +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... None +[default0]: valid_weighted_split_weights .................... None +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.1 +[default0]: world_size ...................................... 8 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 2 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Could not locate the tokenizer configuration file, will try to use the model config instead. +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading configuration file config.json from cache at /gpfswork/rech/ajs/commun/models/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json +[default0]:Model config GPT2Config { +[default0]: "_name_or_path": "gpt2", +[default0]: "activation_function": "gelu_new", +[default0]: "architectures": [ +[default0]: "GPT2LMHeadModel" +[default0]: ], +[default0]: "attn_pdrop": 0.1, +[default0]: "bos_token_id": 50256, +[default0]: "embd_pdrop": 0.1, +[default0]: "eos_token_id": 50256, +[default0]: "initializer_range": 0.02, +[default0]: "layer_norm_epsilon": 1e-05, +[default0]: "model_type": "gpt2", +[default0]: "n_ctx": 1024, +[default0]: "n_embd": 768, +[default0]: "n_head": 12, +[default0]: "n_inner": null, +[default0]: "n_layer": 12, +[default0]: "n_positions": 1024, +[default0]: "reorder_and_upcast_attn": false, +[default0]: "resid_pdrop": 0.1, +[default0]: "scale_attn_by_inverse_layer_idx": false, +[default0]: "scale_attn_weights": true, +[default0]: "summary_activation": null, +[default0]: "summary_first_dropout": 0.1, +[default0]: "summary_proj_to_labels": true, +[default0]: "summary_type": "cls_index", +[default0]: "summary_use_proj": true, +[default0]: "task_specific_params": { +[default0]: "text-generation": { +[default0]: "do_sample": true, +[default0]: "max_length": 50 +[default0]: } +[default0]: }, +[default0]: "transformers_version": "4.25.0.dev0", +[default0]: "use_cache": true, +[default0]: "vocab_size": 50257 +[default0]:} +[default0]: +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file vocab.json from cache at /gpfswork/rech/ajs/commun/models/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/vocab.json +[default0]:loading file merges.txt from cache at /gpfswork/rech/ajs/commun/models/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/merges.txt +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/ajs/commun/models/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at None +[default0]:loading file tokenizer_config.json from cache at None +[default0]:loading configuration file config.json from cache at /gpfswork/rech/ajs/commun/models/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json +[default0]:Model config GPT2Config { +[default0]: "_name_or_path": "gpt2", +[default0]: "activation_function": "gelu_new", +[default0]: "architectures": [ +[default0]: "GPT2LMHeadModel" +[default0]: ], +[default0]: "attn_pdrop": 0.1, +[default0]: "bos_token_id": 50256, +[default0]: "embd_pdrop": 0.1, +[default0]: "eos_token_id": 50256, +[default0]: "initializer_range": 0.02, +[default0]: "layer_norm_epsilon": 1e-05, +[default0]: "model_type": "gpt2", +[default0]: "n_ctx": 1024, +[default0]: "n_embd": 768, +[default0]: "n_head": 12, +[default0]: "n_inner": null, +[default0]: "n_layer": 12, +[default0]: "n_positions": 1024, +[default0]: "reorder_and_upcast_attn": false, +[default0]: "resid_pdrop": 0.1, +[default0]: "scale_attn_by_inverse_layer_idx": false, +[default0]: "scale_attn_weights": true, +[default0]: "summary_activation": null, +[default0]: "summary_first_dropout": 0.1, +[default0]: "summary_proj_to_labels": true, +[default0]: "summary_type": "cls_index", +[default0]: "summary_use_proj": true, +[default0]: "task_specific_params": { +[default0]: "text-generation": { +[default0]: "do_sample": true, +[default0]: "max_length": 50 +[default0]: } +[default0]: }, +[default0]: "transformers_version": "4.25.0.dev0", +[default0]: "use_cache": true, +[default0]: "vocab_size": 50257 +[default0]:} +[default0]: +[default0]: > padded vocab (size: 50257) with 943 dummy tokens (new size: 51200) +[default7]:> setting tensorboard ... +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr11-176B-ml/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.11.0+cu115 +[default0]:torch cuda version ............... 11.5 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfsssd/worksf/projects/rech/six/commun/code/tr11-176B-ml/DeepSpeed/deepspeed'] +[default0]:deepspeed info ................... 0.7.2+2a644488, 2a644488, master +[default0]:deepspeed wheel compiled w. ...... torch 1.11, cuda 11.5 +[default0]:**** Git info for Megatron: git_hash=ef0c6b8 git_branch=mup_integration **** +[default0]:> initializing torch distributed ... +[default0]:[2023-02-03 07:46:19,731] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default0]:> initializing tensor model parallel with size 1 +[default0]:> initializing pipeline model parallel with size 1 +[default0]:> setting random seeds to 42 ... +[default0]:[2023-02-03 07:46:20,090] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsdswork/projects/rech/ajs/commun/code/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsdswork/projects/rech/ajs/commun/code/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.055 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsdswork/projects/rech/ajs/commun/code/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsdswork/projects/rech/ajs/commun/code/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsdswork/projects/rech/ajs/commun/code/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 8.501 seconds +[default0]:time to initialize megatron (seconds): 68.647 +[default0]:[after megatron is initialized] datetime: 2023-02-03 07:46:28 +[default0]:building GPT model ... +[default0]:[2023-02-03 07:46:28,774] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2023-02-03 07:46:28,775] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2023-02-03 07:46:28,775] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 48.44 GB, percent = 9.6% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7} +[default0]:[2023-02-03 07:46:29,003] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +[default0]:stage=0 layers=41 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: ParallelTransformerLayerPipe +[default0]: 28: ParallelTransformerLayerPipe +[default0]: 29: ParallelTransformerLayerPipe +[default0]: 30: ParallelTransformerLayerPipe +[default0]: 31: ParallelTransformerLayerPipe +[default0]: 32: ParallelTransformerLayerPipe +[default0]: 33: ParallelTransformerLayerPipe +[default0]: 34: ParallelTransformerLayerPipe +[default0]: 35: ParallelTransformerLayerPipe +[default0]: 36: undo +[default0]: 37: MixedFusedLayerNorm +[default0]: 38: +[default0]: 39: EmbeddingPipe +[default0]: 40: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2023-02-03 07:46:29,181] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2023-02-03 07:46:29,182] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.11 GB CA 0.12 GB Max_CA 0 GB +[default0]:[2023-02-03 07:46:29,182] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 48.94 GB, percent = 9.7% +[default0]:building GPT model ... +[default0]:[2023-02-03 07:46:29,275] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2023-02-03 07:46:29,276] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.12 GB Max_CA 0 GB +[default0]:[2023-02-03 07:46:29,276] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 48.94 GB, percent = 9.7% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7} +[default0]:[2023-02-03 07:46:29,504] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +[default0]:stage=0 layers=41 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: ParallelTransformerLayerPipe +[default0]: 28: ParallelTransformerLayerPipe +[default0]: 29: ParallelTransformerLayerPipe +[default0]: 30: ParallelTransformerLayerPipe +[default0]: 31: ParallelTransformerLayerPipe +[default0]: 32: ParallelTransformerLayerPipe +[default0]: 33: ParallelTransformerLayerPipe +[default0]: 34: ParallelTransformerLayerPipe +[default0]: 35: ParallelTransformerLayerPipe +[default0]: 36: undo +[default0]: 37: MixedFusedLayerNorm +[default0]: 38: +[default0]: 39: EmbeddingPipe +[default0]: 40: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2023-02-03 07:46:29,647] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2023-02-03 07:46:29,648] [INFO] [utils.py:828:see_memory_usage] MA 0.12 GB Max_MA 0.12 GB CA 0.13 GB Max_CA 0 GB +[default0]:[2023-02-03 07:46:29,648] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 48.96 GB, percent = 9.7% +[default0]:building GPT model ... +[default0]:[2023-02-03 07:46:29,745] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2023-02-03 07:46:29,746] [INFO] [utils.py:828:see_memory_usage] MA 0.12 GB Max_MA 0.12 GB CA 0.13 GB Max_CA 0 GB +[default0]:[2023-02-03 07:46:29,746] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 48.96 GB, percent = 9.7% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7} +[default0]:[2023-02-03 07:46:29,973] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +[default0]:stage=0 layers=41 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: ParallelTransformerLayerPipe +[default0]: 28: ParallelTransformerLayerPipe +[default0]: 29: ParallelTransformerLayerPipe +[default0]: 30: ParallelTransformerLayerPipe +[default0]: 31: ParallelTransformerLayerPipe +[default0]: 32: ParallelTransformerLayerPipe +[default0]: 33: ParallelTransformerLayerPipe +[default0]: 34: ParallelTransformerLayerPipe +[default0]: 35: ParallelTransformerLayerPipe +[default0]: 36: undo +[default0]: 37: MixedFusedLayerNorm +[default0]: 38: +[default0]: 39: EmbeddingPipe +[default0]: 40: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2023-02-03 07:46:30,120] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2023-02-03 07:46:30,121] [INFO] [utils.py:828:see_memory_usage] MA 0.14 GB Max_MA 0.15 GB CA 0.15 GB Max_CA 0 GB +[default0]:[2023-02-03 07:46:30,121] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 48.98 GB, percent = 9.7% +[default0]:setting training iterations to 3814 +[default0]:> learning rate decay style: cosine +[default0]:DeepSpeed is enabled. +[default0]:[2023-02-03 07:46:30,150] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.2+2a644488, git-hash=2a644488, git-branch=master +[default0]:[2023-02-03 07:46:32,392] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default0]:[2023-02-03 07:46:32,393] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default0]:[2023-02-03 07:46:32,393] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default0]:[2023-02-03 07:46:32,405] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2023-02-03 07:46:32,405] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[default0]:[2023-02-03 07:46:32,405] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer +[default0]:[2023-02-03 07:46:32,405] [INFO] [stage_1_and_2.py:134:__init__] Reduce bucket size 500000000 +[default0]:[2023-02-03 07:46:32,405] [INFO] [stage_1_and_2.py:135:__init__] Allgather bucket size 500000000 +[default0]:[2023-02-03 07:46:32,405] [INFO] [stage_1_and_2.py:136:__init__] CPU Offload: False +[default0]:[2023-02-03 07:46:32,405] [INFO] [stage_1_and_2.py:137:__init__] Round robin gradient partitioning: False +[default3]:Rank: 3 partition count [8, 8, 8, 8] and sizes[(1540096, False), (1638400, False), (1605632, False), (13440, False)] +[default0]:Rank: 0 partition count [8, 8, 8, 8] and sizes[(1540096, False), (1638400, False), (1605632, False), (13440, False)] +[default5]:Rank: 5 partition count [8, 8, 8, 8] and sizes[(1540096, False), (1638400, False), (1605632, False), (13440, False)] +[default4]:Rank: 4 partition count [8, 8, 8, 8] and sizes[(1540096, False), (1638400, False), (1605632, False), (13440, False)] +[default2]:Rank: 2 partition count [8, 8, 8, 8] and sizes[(1540096, False), (1638400, False), (1605632, False), (13440, False)] +[default6]:Rank: 6 partition count [8, 8, 8, 8] and sizes[(1540096, False), (1638400, False), (1605632, False), (13440, False)] +[default1]:Rank: 1 partition count [8, 8, 8, 8] and sizes[(1540096, False), (1638400, False), (1605632, False), (13440, False)] +[default7]:Rank: 7 partition count [8, 8, 8, 8] and sizes[(1540096, False), (1638400, False), (1605632, False), (13440, False)] +[default0]:[2023-02-03 07:46:32,610] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states +[default0]:[2023-02-03 07:46:32,611] [INFO] [utils.py:828:see_memory_usage] MA 0.12 GB Max_MA 0.17 GB CA 0.23 GB Max_CA 0 GB +[default0]:[2023-02-03 07:46:32,611] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 49.71 GB, percent = 9.9% +[default0]:[2023-02-03 07:46:32,731] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states +[default0]:[2023-02-03 07:46:32,732] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.18 GB CA 0.27 GB Max_CA 0 GB +[default0]:[2023-02-03 07:46:32,732] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 49.97 GB, percent = 9.9% +[default0]:[2023-02-03 07:46:32,732] [INFO] [stage_1_and_2.py:516:__init__] optimizer state initialized +[default0]:[2023-02-03 07:46:32,826] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer +[default0]:[2023-02-03 07:46:32,826] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.27 GB Max_CA 0 GB +[default0]:[2023-02-03 07:46:32,826] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 49.97 GB, percent = 9.9% +[default0]:[2023-02-03 07:46:32,826] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2023-02-03 07:46:32,826] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2023-02-03 07:46:32,826] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2023-02-03 07:46:32,827] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2023-02-03 07:46:32,827] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2023-02-03 07:46:32,827] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2023-02-03 07:46:32,827] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2023-02-03 07:46:32,827] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2023-02-03 07:46:32,827] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] gradient_accumulation_steps .. 2 +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2023-02-03 07:46:32,828] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] train_batch_size ............. 512 +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 32 +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] world_size ................... 8 +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] zero_enabled ................. True +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:991:print] zero_optimization_stage ...... 1 +[default0]:[2023-02-03 07:46:32,829] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 32, +[default0]: "train_batch_size": 512, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 1 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:[2023-02-03 07:46:32,829] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=2 micro_batch_size=32 +[default0]:[2023-02-03 07:46:32,873] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=41 [0, 41) STAGE_PARAMS=38380544 (38.381M) TOTAL_PARAMS=38380544 (38.381M) UNIQUE_PARAMS=38380544 (38.381M) +[default0]:estimated model parameters: 0.038380544 +[default0]:estimated model parameters without embeddings: 0.025273344 +[default0]:[after model, optimizer, and learning rate scheduler are built] datetime: 2023-02-03 07:46:32 +[default0]:> building train, validation, and test datasets ... +[default0]: > datasets target sizes (minimum size): +[default0]: train: 1953125 +[default0]: validation: 204800 +[default0]: test: 51200 +[default0]:> building train, validation, and test datasets for GPT ... +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.017213 seconds +[default0]: number of documents: 364868892 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 353557956) total of 353557956 documents +[default0]: validation: +[default0]: document indices in [353557956, 364504023) total of 10946067 documents +[default0]: test: +[default0]: document indices in [364504023, 364868892) total of 364869 documents +[default0]:/gpfsdswork/projects/rech/ajs/commun/code/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +[default0]: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +[default0]: > loading doc-idx mapping from /gpfsscratch/rech/ajs/commun/datasets/c4/gpt2tok_c4_text_document_train_indexmap_1953125ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsscratch/rech/ajs/commun/datasets/c4/gpt2tok_c4_text_document_train_indexmap_1953125ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsscratch/rech/ajs/commun/datasets/c4/gpt2tok_c4_text_document_train_indexmap_1953125ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.026 seconds +[default0]: total number of samples: 82627993 +[default0]: total number of epochs: 1 +[default0]: > loading doc-idx mapping from /gpfsscratch/rech/ajs/commun/datasets/c4/gpt2tok_c4_text_document_valid_indexmap_204800ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsscratch/rech/ajs/commun/datasets/c4/gpt2tok_c4_text_document_valid_indexmap_204800ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsscratch/rech/ajs/commun/datasets/c4/gpt2tok_c4_text_document_valid_indexmap_204800ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.036 seconds +[default0]: total number of samples: 2558380 +[default0]: total number of epochs: 1 +[default0]: > loading doc-idx mapping from /gpfsscratch/rech/ajs/commun/datasets/c4/gpt2tok_c4_text_document_test_indexmap_51200ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsscratch/rech/ajs/commun/datasets/c4/gpt2tok_c4_text_document_test_indexmap_51200ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsscratch/rech/ajs/commun/datasets/c4/gpt2tok_c4_text_document_test_indexmap_51200ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.010 seconds +[default0]: total number of samples: 84955 +[default0]: total number of epochs: 1 +[default0]:> finished creating GPT datasets ... +[default7]:time (ms) | model-and-optimizer-setup: 4178.33 | train/valid/test-data-iterators-setup: 11291.76 +[default0]:[after dataloaders are built] datetime: 2023-02-03 07:46:44 +[default0]:done with setup ... +[default0]:training ... +[default0]:Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +[default0]:[000-000] 0.0384B / 0.0253B +[default0]:[before the start of training step] datetime: 2023-02-03 07:46:44 +[default0]:[2023-02-03 07:46:44,478] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information +[default0]:[2023-02-03 07:46:44,479] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False +[default0]:[2023-02-03 07:46:44,479] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 32 total layers +[default0]:[2023-02-03 07:46:44,479] [INFO] [checkpointing.py:554:forward] ----Synchronization False +[default0]:[2023-02-03 07:46:44,479] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False +[default0]:[Rank 0] (after 10 iterations) memory (MB) | allocated: 25773.11376953125 | max allocated: 47141.99267578125 | reserved: 54520.0 | max reserved: 54520.0 +[default7]: iteration 10/ 3814 | consumed samples: 5120 | consumed tokens: 10485760 | elapsed time per iteration (s): 3.60 | learning rate: 1.398E-05 | global batch size: 512 | lm loss: 1.084034E+01 | loss scale: 4096.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 142.408 | TFLOPs: 19.99 | +[default7]: iteration 20/ 3814 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 3.50 | learning rate: 2.796E-05 | global batch size: 512 | lm loss: 1.081202E+01 | loss scale: 4096.0 | grad norm: 0.068 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.397 | TFLOPs: 20.55 | +[default7]: iteration 30/ 3814 | consumed samples: 15360 | consumed tokens: 31457280 | elapsed time per iteration (s): 3.49 | learning rate: 4.194E-05 | global batch size: 512 | lm loss: 1.078011E+01 | loss scale: 4096.0 | grad norm: 0.055 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.529 | TFLOPs: 20.57 | +[default7]: iteration 40/ 3814 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 3.50 | learning rate: 5.592E-05 | global batch size: 512 | lm loss: 1.075732E+01 | loss scale: 4096.0 | grad norm: 0.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.462 | TFLOPs: 20.56 | +[default7]: iteration 50/ 3814 | consumed samples: 25600 | consumed tokens: 52428800 | elapsed time per iteration (s): 3.50 | learning rate: 6.991E-05 | global batch size: 512 | lm loss: 1.073784E+01 | loss scale: 4096.0 | grad norm: 0.053 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.182 | TFLOPs: 20.52 | +[default7]: iteration 60/ 3814 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 3.50 | learning rate: 8.389E-05 | global batch size: 512 | lm loss: 1.071627E+01 | loss scale: 4096.0 | grad norm: 0.054 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.288 | TFLOPs: 20.54 | +[default7]: iteration 70/ 3814 | consumed samples: 35840 | consumed tokens: 73400320 | elapsed time per iteration (s): 3.50 | learning rate: 9.787E-05 | global batch size: 512 | lm loss: 1.069011E+01 | loss scale: 4096.0 | grad norm: 0.054 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.387 | TFLOPs: 20.55 | +[default7]: iteration 80/ 3814 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 3.50 | learning rate: 1.118E-04 | global batch size: 512 | lm loss: 1.066068E+01 | loss scale: 4096.0 | grad norm: 0.054 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.285 | TFLOPs: 20.54 | +[default7]: iteration 90/ 3814 | consumed samples: 46080 | consumed tokens: 94371840 | elapsed time per iteration (s): 3.50 | learning rate: 1.258E-04 | global batch size: 512 | lm loss: 1.062674E+01 | loss scale: 4096.0 | grad norm: 0.056 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.440 | TFLOPs: 20.56 | +[default7]: iteration 100/ 3814 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 3.50 | learning rate: 1.398E-04 | global batch size: 512 | lm loss: 1.059065E+01 | loss scale: 4096.0 | grad norm: 0.057 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.492 | TFLOPs: 20.57 | +[default7]: iteration 110/ 3814 | consumed samples: 56320 | consumed tokens: 115343360 | elapsed time per iteration (s): 3.50 | learning rate: 1.538E-04 | global batch size: 512 | lm loss: 1.055071E+01 | loss scale: 4096.0 | grad norm: 0.059 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.439 | TFLOPs: 20.56 | +[default7]: iteration 120/ 3814 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 3.50 | learning rate: 1.678E-04 | global batch size: 512 | lm loss: 1.050794E+01 | loss scale: 4096.0 | grad norm: 0.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.440 | TFLOPs: 20.56 | +[default7]: iteration 130/ 3814 | consumed samples: 66560 | consumed tokens: 136314880 | elapsed time per iteration (s): 3.50 | learning rate: 1.818E-04 | global batch size: 512 | lm loss: 1.045973E+01 | loss scale: 4096.0 | grad norm: 0.063 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.433 | TFLOPs: 20.56 | +[default7]: iteration 140/ 3814 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 3.50 | learning rate: 1.957E-04 | global batch size: 512 | lm loss: 1.040801E+01 | loss scale: 4096.0 | grad norm: 0.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.433 | TFLOPs: 20.56 | +[default7]: iteration 150/ 3814 | consumed samples: 76800 | consumed tokens: 157286400 | elapsed time per iteration (s): 3.50 | learning rate: 2.097E-04 | global batch size: 512 | lm loss: 1.035154E+01 | loss scale: 4096.0 | grad norm: 0.070 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.400 | TFLOPs: 20.55 | +[default7]: iteration 160/ 3814 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 3.50 | learning rate: 2.237E-04 | global batch size: 512 | lm loss: 1.029051E+01 | loss scale: 4096.0 | grad norm: 0.072 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.388 | TFLOPs: 20.55 | +[default7]: iteration 170/ 3814 | consumed samples: 87040 | consumed tokens: 178257920 | elapsed time per iteration (s): 3.50 | learning rate: 2.377E-04 | global batch size: 512 | lm loss: 1.022570E+01 | loss scale: 4096.0 | grad norm: 0.076 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.362 | TFLOPs: 20.55 | +[default7]: iteration 180/ 3814 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 3.50 | learning rate: 2.517E-04 | global batch size: 512 | lm loss: 1.015609E+01 | loss scale: 4096.0 | grad norm: 0.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.398 | TFLOPs: 20.55 | +[default7]: iteration 190/ 3814 | consumed samples: 97280 | consumed tokens: 199229440 | elapsed time per iteration (s): 3.50 | learning rate: 2.656E-04 | global batch size: 512 | lm loss: 1.008189E+01 | loss scale: 4096.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.440 | TFLOPs: 20.56 | +[default7]: iteration 200/ 3814 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 3.50 | learning rate: 2.796E-04 | global batch size: 512 | lm loss: 1.000351E+01 | loss scale: 4096.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.406 | TFLOPs: 20.55 | +[default7]: iteration 210/ 3814 | consumed samples: 107520 | consumed tokens: 220200960 | elapsed time per iteration (s): 3.50 | learning rate: 2.936E-04 | global batch size: 512 | lm loss: 9.921559E+00 | loss scale: 4096.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.442 | TFLOPs: 20.56 | +[default7]: iteration 220/ 3814 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 3.50 | learning rate: 3.076E-04 | global batch size: 512 | lm loss: 9.836179E+00 | loss scale: 4096.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.402 | TFLOPs: 20.55 | +[default7]: iteration 230/ 3814 | consumed samples: 117760 | consumed tokens: 241172480 | elapsed time per iteration (s): 3.50 | learning rate: 3.216E-04 | global batch size: 512 | lm loss: 9.749045E+00 | loss scale: 4096.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.334 | TFLOPs: 20.54 | +[default7]: iteration 240/ 3814 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 3.50 | learning rate: 3.355E-04 | global batch size: 512 | lm loss: 9.655286E+00 | loss scale: 4096.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.345 | TFLOPs: 20.55 | +[default7]: iteration 250/ 3814 | consumed samples: 128000 | consumed tokens: 262144000 | elapsed time per iteration (s): 3.50 | learning rate: 3.495E-04 | global batch size: 512 | lm loss: 9.562415E+00 | loss scale: 4096.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.319 | TFLOPs: 20.54 | +[default7]: iteration 260/ 3814 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 3.50 | learning rate: 3.635E-04 | global batch size: 512 | lm loss: 9.463052E+00 | loss scale: 4096.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.300 | TFLOPs: 20.54 | +[default7]: iteration 270/ 3814 | consumed samples: 138240 | consumed tokens: 283115520 | elapsed time per iteration (s): 3.50 | learning rate: 3.775E-04 | global batch size: 512 | lm loss: 9.363403E+00 | loss scale: 4096.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.348 | TFLOPs: 20.55 | +[default7]: iteration 280/ 3814 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 3.50 | learning rate: 3.915E-04 | global batch size: 512 | lm loss: 9.259320E+00 | loss scale: 4096.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.267 | TFLOPs: 20.53 | +[default7]: iteration 290/ 3814 | consumed samples: 148480 | consumed tokens: 304087040 | elapsed time per iteration (s): 3.50 | learning rate: 4.055E-04 | global batch size: 512 | lm loss: 9.155840E+00 | loss scale: 4096.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.333 | TFLOPs: 20.54 | +[default7]: iteration 300/ 3814 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 3.50 | learning rate: 4.194E-04 | global batch size: 512 | lm loss: 9.047672E+00 | loss scale: 4096.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.417 | TFLOPs: 20.56 | +[default7]: iteration 310/ 3814 | consumed samples: 158720 | consumed tokens: 325058560 | elapsed time per iteration (s): 3.50 | learning rate: 4.334E-04 | global batch size: 512 | lm loss: 8.940772E+00 | loss scale: 4096.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.379 | TFLOPs: 20.55 | +[default7]: iteration 320/ 3814 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 3.50 | learning rate: 4.474E-04 | global batch size: 512 | lm loss: 8.830733E+00 | loss scale: 4096.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.428 | TFLOPs: 20.56 | +[default7]: iteration 330/ 3814 | consumed samples: 168960 | consumed tokens: 346030080 | elapsed time per iteration (s): 3.50 | learning rate: 4.614E-04 | global batch size: 512 | lm loss: 8.723718E+00 | loss scale: 4096.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.428 | TFLOPs: 20.56 | +[default7]: iteration 340/ 3814 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 3.50 | learning rate: 4.754E-04 | global batch size: 512 | lm loss: 8.613307E+00 | loss scale: 4096.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.440 | TFLOPs: 20.56 | +[default7]: iteration 350/ 3814 | consumed samples: 179200 | consumed tokens: 367001600 | elapsed time per iteration (s): 3.50 | learning rate: 4.893E-04 | global batch size: 512 | lm loss: 8.507584E+00 | loss scale: 4096.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.442 | TFLOPs: 20.56 | +[default7]: iteration 360/ 3814 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 3.50 | learning rate: 5.000E-04 | global batch size: 512 | lm loss: 8.391431E+00 | loss scale: 4096.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.414 | TFLOPs: 20.56 | +[default7]: iteration 370/ 3814 | consumed samples: 189440 | consumed tokens: 387973120 | elapsed time per iteration (s): 3.50 | learning rate: 5.000E-04 | global batch size: 512 | lm loss: 8.283817E+00 | loss scale: 4096.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.376 | TFLOPs: 20.55 | +[default7]: iteration 380/ 3814 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 3.50 | learning rate: 4.999E-04 | global batch size: 512 | lm loss: 8.181972E+00 | loss scale: 4096.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.347 | TFLOPs: 20.55 | +[default7]: iteration 390/ 3814 | consumed samples: 199680 | consumed tokens: 408944640 | elapsed time per iteration (s): 3.50 | learning rate: 4.999E-04 | global batch size: 512 | lm loss: 8.086014E+00 | loss scale: 4096.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.187 | TFLOPs: 20.52 | +[default7]: iteration 400/ 3814 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 3.50 | learning rate: 4.998E-04 | global batch size: 512 | lm loss: 7.996132E+00 | loss scale: 4096.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.189 | TFLOPs: 20.52 | +[default7]: iteration 410/ 3814 | consumed samples: 209920 | consumed tokens: 429916160 | elapsed time per iteration (s): 3.50 | learning rate: 4.997E-04 | global batch size: 512 | lm loss: 7.908957E+00 | loss scale: 4096.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.285 | TFLOPs: 20.54 | +[default7]: iteration 420/ 3814 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 3.50 | learning rate: 4.996E-04 | global batch size: 512 | lm loss: 7.818993E+00 | loss scale: 4096.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.370 | TFLOPs: 20.55 | +[default7]: iteration 430/ 3814 | consumed samples: 220160 | consumed tokens: 450887680 | elapsed time per iteration (s): 3.50 | learning rate: 4.995E-04 | global batch size: 512 | lm loss: 7.741847E+00 | loss scale: 4096.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.325 | TFLOPs: 20.54 | +[default7]: iteration 440/ 3814 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 3.50 | learning rate: 4.993E-04 | global batch size: 512 | lm loss: 7.668082E+00 | loss scale: 4096.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.303 | TFLOPs: 20.54 | +[default7]: iteration 450/ 3814 | consumed samples: 230400 | consumed tokens: 471859200 | elapsed time per iteration (s): 3.50 | learning rate: 4.991E-04 | global batch size: 512 | lm loss: 7.598717E+00 | loss scale: 4096.0 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.277 | TFLOPs: 20.54 | +[default7]: iteration 460/ 3814 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 3.50 | learning rate: 4.989E-04 | global batch size: 512 | lm loss: 7.524449E+00 | loss scale: 4096.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.295 | TFLOPs: 20.54 | +[default7]: iteration 470/ 3814 | consumed samples: 240640 | consumed tokens: 492830720 | elapsed time per iteration (s): 3.50 | learning rate: 4.987E-04 | global batch size: 512 | lm loss: 7.464769E+00 | loss scale: 4096.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.285 | TFLOPs: 20.54 | +[default7]: iteration 480/ 3814 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 3.50 | learning rate: 4.985E-04 | global batch size: 512 | lm loss: 7.400432E+00 | loss scale: 4096.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.299 | TFLOPs: 20.54 | +[default7]: iteration 490/ 3814 | consumed samples: 250880 | consumed tokens: 513802240 | elapsed time per iteration (s): 3.50 | learning rate: 4.982E-04 | global batch size: 512 | lm loss: 7.342271E+00 | loss scale: 4096.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.345 | TFLOPs: 20.55 | +[default7]: iteration 500/ 3814 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 3.50 | learning rate: 4.979E-04 | global batch size: 512 | lm loss: 7.277428E+00 | loss scale: 8192.0 | grad norm: 0.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.360 | TFLOPs: 20.55 | +[default7]: iteration 510/ 3814 | consumed samples: 261120 | consumed tokens: 534773760 | elapsed time per iteration (s): 3.50 | learning rate: 4.976E-04 | global batch size: 512 | lm loss: 7.235657E+00 | loss scale: 8192.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.341 | TFLOPs: 20.55 | +[default7]: iteration 520/ 3814 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 3.50 | learning rate: 4.973E-04 | global batch size: 512 | lm loss: 7.179829E+00 | loss scale: 8192.0 | grad norm: 0.076 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.305 | TFLOPs: 20.54 | +[default7]: iteration 530/ 3814 | consumed samples: 271360 | consumed tokens: 555745280 | elapsed time per iteration (s): 3.50 | learning rate: 4.969E-04 | global batch size: 512 | lm loss: 7.136535E+00 | loss scale: 8192.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.327 | TFLOPs: 20.54 | +[default7]: iteration 540/ 3814 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 3.50 | learning rate: 4.966E-04 | global batch size: 512 | lm loss: 7.093374E+00 | loss scale: 8192.0 | grad norm: 0.075 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.364 | TFLOPs: 20.55 | +[default7]: iteration 550/ 3814 | consumed samples: 281600 | consumed tokens: 576716800 | elapsed time per iteration (s): 3.50 | learning rate: 4.962E-04 | global batch size: 512 | lm loss: 7.036620E+00 | loss scale: 8192.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.290 | TFLOPs: 20.54 | +[default7]: iteration 560/ 3814 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 3.50 | learning rate: 4.958E-04 | global batch size: 512 | lm loss: 6.993396E+00 | loss scale: 8192.0 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.328 | TFLOPs: 20.54 | +[default7]: iteration 570/ 3814 | consumed samples: 291840 | consumed tokens: 597688320 | elapsed time per iteration (s): 3.50 | learning rate: 4.954E-04 | global batch size: 512 | lm loss: 6.941160E+00 | loss scale: 8192.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.333 | TFLOPs: 20.54 | +[default7]: iteration 580/ 3814 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 3.50 | learning rate: 4.949E-04 | global batch size: 512 | lm loss: 6.907193E+00 | loss scale: 8192.0 | grad norm: 0.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.298 | TFLOPs: 20.54 | +[default7]: iteration 590/ 3814 | consumed samples: 302080 | consumed tokens: 618659840 | elapsed time per iteration (s): 3.50 | learning rate: 4.944E-04 | global batch size: 512 | lm loss: 6.860229E+00 | loss scale: 8192.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.252 | TFLOPs: 20.53 | +[default7]: iteration 600/ 3814 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 3.50 | learning rate: 4.940E-04 | global batch size: 512 | lm loss: 6.816914E+00 | loss scale: 8192.0 | grad norm: 0.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.245 | TFLOPs: 20.53 | +[default7]: iteration 610/ 3814 | consumed samples: 312320 | consumed tokens: 639631360 | elapsed time per iteration (s): 3.50 | learning rate: 4.935E-04 | global batch size: 512 | lm loss: 6.782854E+00 | loss scale: 8192.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.182 | TFLOPs: 20.52 | +[default7]: iteration 620/ 3814 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 3.50 | learning rate: 4.929E-04 | global batch size: 512 | lm loss: 6.747314E+00 | loss scale: 8192.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.193 | TFLOPs: 20.52 | +[default7]: iteration 630/ 3814 | consumed samples: 322560 | consumed tokens: 660602880 | elapsed time per iteration (s): 3.50 | learning rate: 4.924E-04 | global batch size: 512 | lm loss: 6.699776E+00 | loss scale: 8192.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.191 | TFLOPs: 20.52 | +[default7]: iteration 640/ 3814 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 3.50 | learning rate: 4.918E-04 | global batch size: 512 | lm loss: 6.675526E+00 | loss scale: 8192.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.214 | TFLOPs: 20.53 | +[default7]: iteration 650/ 3814 | consumed samples: 332800 | consumed tokens: 681574400 | elapsed time per iteration (s): 3.50 | learning rate: 4.912E-04 | global batch size: 512 | lm loss: 6.638416E+00 | loss scale: 8192.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.219 | TFLOPs: 20.53 | +[default7]: iteration 660/ 3814 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 3.50 | learning rate: 4.906E-04 | global batch size: 512 | lm loss: 6.600559E+00 | loss scale: 8192.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.266 | TFLOPs: 20.53 | +[default7]: iteration 670/ 3814 | consumed samples: 343040 | consumed tokens: 702545920 | elapsed time per iteration (s): 3.50 | learning rate: 4.900E-04 | global batch size: 512 | lm loss: 6.571600E+00 | loss scale: 8192.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.227 | TFLOPs: 20.53 | +[default7]: iteration 680/ 3814 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 3.50 | learning rate: 4.893E-04 | global batch size: 512 | lm loss: 6.532260E+00 | loss scale: 8192.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.225 | TFLOPs: 20.53 | +[default7]: iteration 690/ 3814 | consumed samples: 353280 | consumed tokens: 723517440 | elapsed time per iteration (s): 3.50 | learning rate: 4.887E-04 | global batch size: 512 | lm loss: 6.499902E+00 | loss scale: 8192.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.274 | TFLOPs: 20.54 | +[default7]: iteration 700/ 3814 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 3.50 | learning rate: 4.880E-04 | global batch size: 512 | lm loss: 6.463818E+00 | loss scale: 8192.0 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.205 | TFLOPs: 20.53 | +[default7]: iteration 710/ 3814 | consumed samples: 363520 | consumed tokens: 744488960 | elapsed time per iteration (s): 3.50 | learning rate: 4.873E-04 | global batch size: 512 | lm loss: 6.446008E+00 | loss scale: 8192.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.223 | TFLOPs: 20.53 | +[default7]: iteration 720/ 3814 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 3.50 | learning rate: 4.866E-04 | global batch size: 512 | lm loss: 6.418660E+00 | loss scale: 8192.0 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.219 | TFLOPs: 20.53 | +[default7]: iteration 730/ 3814 | consumed samples: 373760 | consumed tokens: 765460480 | elapsed time per iteration (s): 3.50 | learning rate: 4.858E-04 | global batch size: 512 | lm loss: 6.398177E+00 | loss scale: 8192.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.267 | TFLOPs: 20.53 | +[default7]: iteration 740/ 3814 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 3.50 | learning rate: 4.851E-04 | global batch size: 512 | lm loss: 6.367193E+00 | loss scale: 8192.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.262 | TFLOPs: 20.53 | +[default7]: iteration 750/ 3814 | consumed samples: 384000 | consumed tokens: 786432000 | elapsed time per iteration (s): 3.50 | learning rate: 4.843E-04 | global batch size: 512 | lm loss: 6.341607E+00 | loss scale: 8192.0 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.233 | TFLOPs: 20.53 | +[default7]: iteration 760/ 3814 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 3.50 | learning rate: 4.835E-04 | global batch size: 512 | lm loss: 6.330473E+00 | loss scale: 8192.0 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.208 | TFLOPs: 20.53 | +[default7]: iteration 770/ 3814 | consumed samples: 394240 | consumed tokens: 807403520 | elapsed time per iteration (s): 3.50 | learning rate: 4.827E-04 | global batch size: 512 | lm loss: 6.297069E+00 | loss scale: 8192.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.233 | TFLOPs: 20.53 | +[default7]: iteration 780/ 3814 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 3.50 | learning rate: 4.818E-04 | global batch size: 512 | lm loss: 6.281359E+00 | loss scale: 8192.0 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.209 | TFLOPs: 20.53 | +[default7]: iteration 790/ 3814 | consumed samples: 404480 | consumed tokens: 828375040 | elapsed time per iteration (s): 3.50 | learning rate: 4.809E-04 | global batch size: 512 | lm loss: 6.264540E+00 | loss scale: 8192.0 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.222 | TFLOPs: 20.53 | +[default7]: iteration 800/ 3814 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 3.50 | learning rate: 4.801E-04 | global batch size: 512 | lm loss: 6.237301E+00 | loss scale: 8192.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.210 | TFLOPs: 20.53 | +[default7]: iteration 810/ 3814 | consumed samples: 414720 | consumed tokens: 849346560 | elapsed time per iteration (s): 3.50 | learning rate: 4.792E-04 | global batch size: 512 | lm loss: 6.216414E+00 | loss scale: 8192.0 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.190 | TFLOPs: 20.52 | +[default7]: iteration 820/ 3814 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 3.50 | learning rate: 4.783E-04 | global batch size: 512 | lm loss: 6.205946E+00 | loss scale: 8192.0 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.178 | TFLOPs: 20.52 | +[default7]: iteration 830/ 3814 | consumed samples: 424960 | consumed tokens: 870318080 | elapsed time per iteration (s): 3.50 | learning rate: 4.773E-04 | global batch size: 512 | lm loss: 6.188189E+00 | loss scale: 8192.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.196 | TFLOPs: 20.52 | +[default7]: iteration 840/ 3814 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 3.50 | learning rate: 4.764E-04 | global batch size: 512 | lm loss: 6.161731E+00 | loss scale: 8192.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.155 | TFLOPs: 20.52 | +[default7]: iteration 850/ 3814 | consumed samples: 435200 | consumed tokens: 891289600 | elapsed time per iteration (s): 3.50 | learning rate: 4.754E-04 | global batch size: 512 | lm loss: 6.163130E+00 | loss scale: 8192.0 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.234 | TFLOPs: 20.53 | +[default7]: iteration 860/ 3814 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 3.50 | learning rate: 4.744E-04 | global batch size: 512 | lm loss: 6.134755E+00 | loss scale: 8192.0 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.227 | TFLOPs: 20.53 | +[default7]: iteration 870/ 3814 | consumed samples: 445440 | consumed tokens: 912261120 | elapsed time per iteration (s): 3.50 | learning rate: 4.734E-04 | global batch size: 512 | lm loss: 6.119118E+00 | loss scale: 8192.0 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.268 | TFLOPs: 20.53 | +[default7]: iteration 880/ 3814 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 3.50 | learning rate: 4.724E-04 | global batch size: 512 | lm loss: 6.106934E+00 | loss scale: 8192.0 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.245 | TFLOPs: 20.53 | +[default7]: iteration 890/ 3814 | consumed samples: 455680 | consumed tokens: 933232640 | elapsed time per iteration (s): 3.50 | learning rate: 4.713E-04 | global batch size: 512 | lm loss: 6.081207E+00 | loss scale: 8192.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.229 | TFLOPs: 20.53 | +[default7]: iteration 900/ 3814 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 3.50 | learning rate: 4.702E-04 | global batch size: 512 | lm loss: 6.078551E+00 | loss scale: 8192.0 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.320 | TFLOPs: 20.54 | +[default7]: iteration 910/ 3814 | consumed samples: 465920 | consumed tokens: 954204160 | elapsed time per iteration (s): 3.50 | learning rate: 4.692E-04 | global batch size: 512 | lm loss: 6.058734E+00 | loss scale: 8192.0 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.333 | TFLOPs: 20.54 | +[default7]: iteration 920/ 3814 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 3.50 | learning rate: 4.681E-04 | global batch size: 512 | lm loss: 6.039846E+00 | loss scale: 8192.0 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.309 | TFLOPs: 20.54 | +[default7]: iteration 930/ 3814 | consumed samples: 476160 | consumed tokens: 975175680 | elapsed time per iteration (s): 3.50 | learning rate: 4.669E-04 | global batch size: 512 | lm loss: 6.029015E+00 | loss scale: 8192.0 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.300 | TFLOPs: 20.54 | +[default7]: iteration 940/ 3814 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 3.50 | learning rate: 4.658E-04 | global batch size: 512 | lm loss: 6.009462E+00 | loss scale: 8192.0 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.303 | TFLOPs: 20.54 | +[default7]: iteration 950/ 3814 | consumed samples: 486400 | consumed tokens: 996147200 | elapsed time per iteration (s): 3.50 | learning rate: 4.646E-04 | global batch size: 512 | lm loss: 6.008062E+00 | loss scale: 8192.0 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.290 | TFLOPs: 20.54 | +[default7]: iteration 960/ 3814 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 3.50 | learning rate: 4.635E-04 | global batch size: 512 | lm loss: 5.982338E+00 | loss scale: 8192.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.317 | TFLOPs: 20.54 | +[default7]: iteration 970/ 3814 | consumed samples: 496640 | consumed tokens: 1017118720 | elapsed time per iteration (s): 3.50 | learning rate: 4.623E-04 | global batch size: 512 | lm loss: 5.966544E+00 | loss scale: 8192.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.305 | TFLOPs: 20.54 | +[default7]: iteration 980/ 3814 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 3.50 | learning rate: 4.611E-04 | global batch size: 512 | lm loss: 5.960020E+00 | loss scale: 8192.0 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.272 | TFLOPs: 20.54 | +[default7]: iteration 990/ 3814 | consumed samples: 506880 | consumed tokens: 1038090240 | elapsed time per iteration (s): 3.50 | learning rate: 4.598E-04 | global batch size: 512 | lm loss: 5.942339E+00 | loss scale: 8192.0 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.295 | TFLOPs: 20.54 | +[default7]: iteration 1000/ 3814 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 3.50 | learning rate: 4.586E-04 | global batch size: 512 | lm loss: 5.926682E+00 | loss scale: 16384.0 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.166 | TFLOPs: 20.52 | +[default7]:------------------------------------------------------------------------------------------ +[default7]:valid loss at iteration 1000 | lm loss value: 5.883473E+00 | lm loss PPL: 3.590540E+02 | +[default7]:------------------------------------------------------------------------------------------ +[default0]:saving checkpoint at iteration 1000 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default0]:[2023-02-03 08:46:44,702] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! +[default0]:[2023-02-03 08:46:44,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_01-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_01-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,746] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_04-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_04-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_05-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_05-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_06-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_06-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,755] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_07-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_07-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,759] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_08-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_08-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,762] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_09-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_09-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,764] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_10-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_10-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,767] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_11-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_11-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_12-model_00-model_states.pt... +[default2]:[2023-02-03 08:46:44,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default6]:[2023-02-03 08:46:44,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default5]:[2023-02-03 08:46:44,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default0]:[2023-02-03 08:46:44,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_12-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,772] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_13-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_13-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,775] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_14-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_14-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,777] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_15-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_15-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,779] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_16-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_16-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,782] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_17-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_17-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,784] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_18-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_18-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_19-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_19-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_20-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_20-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,792] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_21-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_21-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,794] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_22-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_22-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,796] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_23-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_23-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,799] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_24-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_24-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,801] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_25-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_25-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,803] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_26-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_26-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,806] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_27-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_27-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,808] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_28-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_28-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,810] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_29-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_29-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,813] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_30-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_30-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,815] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_31-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_31-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_32-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_32-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,820] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_33-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_33-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,822] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_34-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_34-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,825] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_35-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_35-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_37-model_00-model_states.pt... +[default0]:[2023-02-03 08:46:44,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/layer_37-model_00-model_states.pt. +[default0]:[2023-02-03 08:46:44,828] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/mp_rank_00_model_states.pt +[default0]:[2023-02-03 08:46:44,829] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/mp_rank_00_model_states.pt... +[default0]:[2023-02-03 08:46:44,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/mp_rank_00_model_states.pt. +[default0]:[2023-02-03 08:46:44,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default4]:[2023-02-03 08:46:44,837] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default1]:[2023-02-03 08:46:44,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default3]:[2023-02-03 08:46:44,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default7]:[2023-02-03 08:46:44,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default2]:[2023-02-03 08:46:44,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2023-02-03 08:46:44,927] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default2]:[2023-02-03 08:46:44,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default6]:[2023-02-03 08:46:44,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2023-02-03 08:46:44,919] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default6]:[2023-02-03 08:46:44,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default5]:[2023-02-03 08:46:44,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2023-02-03 08:46:44,930] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default5]:[2023-02-03 08:46:44,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default0]:[2023-02-03 08:46:44,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2023-02-03 08:46:44,918] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2023-02-03 08:46:44,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default0]: successfully saved checkpoint at iteration 1000 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default4]:[2023-02-03 08:46:44,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2023-02-03 08:46:44,932] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default4]:[2023-02-03 08:46:44,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default1]:[2023-02-03 08:46:44,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2023-02-03 08:46:44,920] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default1]:[2023-02-03 08:46:44,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default3]:[2023-02-03 08:46:44,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2023-02-03 08:46:44,922] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default3]:[2023-02-03 08:46:44,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default7]:[2023-02-03 08:46:44,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2023-02-03 08:46:44,929] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step1000/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default7]:[2023-02-03 08:46:44,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default7]:time (ms) | save-checkpoint: 236.28 +[default7]: iteration 1010/ 3814 | consumed samples: 517120 | consumed tokens: 1059061760 | elapsed time per iteration (s): 13.51 | learning rate: 4.573E-04 | global batch size: 512 | lm loss: 5.933245E+00 | loss scale: 16384.0 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 37.887 | TFLOPs: 5.32 | +[default7]: iteration 1020/ 3814 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 3.50 | learning rate: 4.561E-04 | global batch size: 512 | lm loss: 5.908319E+00 | loss scale: 16384.0 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.170 | TFLOPs: 20.52 | +[default7]: iteration 1030/ 3814 | consumed samples: 527360 | consumed tokens: 1080033280 | elapsed time per iteration (s): 3.50 | learning rate: 4.548E-04 | global batch size: 512 | lm loss: 5.890742E+00 | loss scale: 16384.0 | grad norm: 0.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.193 | TFLOPs: 20.52 | +[default7]: iteration 1040/ 3814 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 3.50 | learning rate: 4.535E-04 | global batch size: 512 | lm loss: 5.888058E+00 | loss scale: 16384.0 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.227 | TFLOPs: 20.53 | +[default7]: iteration 1050/ 3814 | consumed samples: 537600 | consumed tokens: 1101004800 | elapsed time per iteration (s): 3.50 | learning rate: 4.521E-04 | global batch size: 512 | lm loss: 5.871204E+00 | loss scale: 16384.0 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.179 | TFLOPs: 20.52 | +[default7]: iteration 1060/ 3814 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 3.50 | learning rate: 4.508E-04 | global batch size: 512 | lm loss: 5.863955E+00 | loss scale: 16384.0 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.197 | TFLOPs: 20.52 | +[default7]: iteration 1070/ 3814 | consumed samples: 547840 | consumed tokens: 1121976320 | elapsed time per iteration (s): 3.50 | learning rate: 4.494E-04 | global batch size: 512 | lm loss: 5.851745E+00 | loss scale: 16384.0 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.189 | TFLOPs: 20.52 | +[default7]: iteration 1080/ 3814 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 3.50 | learning rate: 4.480E-04 | global batch size: 512 | lm loss: 5.832874E+00 | loss scale: 16384.0 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.303 | TFLOPs: 20.54 | +[default7]: iteration 1090/ 3814 | consumed samples: 558080 | consumed tokens: 1142947840 | elapsed time per iteration (s): 3.50 | learning rate: 4.466E-04 | global batch size: 512 | lm loss: 5.824191E+00 | loss scale: 16384.0 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.305 | TFLOPs: 20.54 | +[default7]: iteration 1100/ 3814 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 3.50 | learning rate: 4.452E-04 | global batch size: 512 | lm loss: 5.806806E+00 | loss scale: 16384.0 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.323 | TFLOPs: 20.54 | +[default7]: iteration 1110/ 3814 | consumed samples: 568320 | consumed tokens: 1163919360 | elapsed time per iteration (s): 3.50 | learning rate: 4.438E-04 | global batch size: 512 | lm loss: 5.803590E+00 | loss scale: 16384.0 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.308 | TFLOPs: 20.54 | +[default7]: iteration 1120/ 3814 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 3.50 | learning rate: 4.424E-04 | global batch size: 512 | lm loss: 5.789856E+00 | loss scale: 16384.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.291 | TFLOPs: 20.54 | +[default7]: iteration 1130/ 3814 | consumed samples: 578560 | consumed tokens: 1184890880 | elapsed time per iteration (s): 3.50 | learning rate: 4.409E-04 | global batch size: 512 | lm loss: 5.776485E+00 | loss scale: 16384.0 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.211 | TFLOPs: 20.53 | +[default7]: iteration 1140/ 3814 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 3.50 | learning rate: 4.394E-04 | global batch size: 512 | lm loss: 5.772342E+00 | loss scale: 16384.0 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.221 | TFLOPs: 20.53 | +[default7]: iteration 1150/ 3814 | consumed samples: 588800 | consumed tokens: 1205862400 | elapsed time per iteration (s): 3.50 | learning rate: 4.379E-04 | global batch size: 512 | lm loss: 5.754970E+00 | loss scale: 16384.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.262 | TFLOPs: 20.53 | +[default7]: iteration 1160/ 3814 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 3.50 | learning rate: 4.364E-04 | global batch size: 512 | lm loss: 5.758207E+00 | loss scale: 16384.0 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.278 | TFLOPs: 20.54 | +[default7]: iteration 1170/ 3814 | consumed samples: 599040 | consumed tokens: 1226833920 | elapsed time per iteration (s): 3.50 | learning rate: 4.349E-04 | global batch size: 512 | lm loss: 5.745559E+00 | loss scale: 16384.0 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.334 | TFLOPs: 20.54 | +[default7]: iteration 1180/ 3814 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 3.50 | learning rate: 4.334E-04 | global batch size: 512 | lm loss: 5.730128E+00 | loss scale: 16384.0 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.334 | TFLOPs: 20.54 | +[default7]: iteration 1190/ 3814 | consumed samples: 609280 | consumed tokens: 1247805440 | elapsed time per iteration (s): 3.50 | learning rate: 4.318E-04 | global batch size: 512 | lm loss: 5.721192E+00 | loss scale: 16384.0 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.329 | TFLOPs: 20.54 | +[default7]: iteration 1200/ 3814 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 3.50 | learning rate: 4.303E-04 | global batch size: 512 | lm loss: 5.710453E+00 | loss scale: 16384.0 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.335 | TFLOPs: 20.54 | +[default7]: iteration 1210/ 3814 | consumed samples: 619520 | consumed tokens: 1268776960 | elapsed time per iteration (s): 3.50 | learning rate: 4.287E-04 | global batch size: 512 | lm loss: 5.708004E+00 | loss scale: 16384.0 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.314 | TFLOPs: 20.54 | +[default7]: iteration 1220/ 3814 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 3.50 | learning rate: 4.271E-04 | global batch size: 512 | lm loss: 5.686392E+00 | loss scale: 16384.0 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.326 | TFLOPs: 20.54 | +[default7]: iteration 1230/ 3814 | consumed samples: 629760 | consumed tokens: 1289748480 | elapsed time per iteration (s): 3.50 | learning rate: 4.255E-04 | global batch size: 512 | lm loss: 5.687331E+00 | loss scale: 16384.0 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.315 | TFLOPs: 20.54 | +[default7]: iteration 1240/ 3814 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 3.50 | learning rate: 4.238E-04 | global batch size: 512 | lm loss: 5.672904E+00 | loss scale: 16384.0 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.275 | TFLOPs: 20.54 | +[default7]: iteration 1250/ 3814 | consumed samples: 640000 | consumed tokens: 1310720000 | elapsed time per iteration (s): 3.50 | learning rate: 4.222E-04 | global batch size: 512 | lm loss: 5.671751E+00 | loss scale: 16384.0 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.264 | TFLOPs: 20.53 | +[default7]: iteration 1260/ 3814 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 3.50 | learning rate: 4.206E-04 | global batch size: 512 | lm loss: 5.659391E+00 | loss scale: 16384.0 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.280 | TFLOPs: 20.54 | +[default7]: iteration 1270/ 3814 | consumed samples: 650240 | consumed tokens: 1331691520 | elapsed time per iteration (s): 3.50 | learning rate: 4.189E-04 | global batch size: 512 | lm loss: 5.645214E+00 | loss scale: 16384.0 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.290 | TFLOPs: 20.54 | +[default7]: iteration 1280/ 3814 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 3.50 | learning rate: 4.172E-04 | global batch size: 512 | lm loss: 5.645622E+00 | loss scale: 16384.0 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.249 | TFLOPs: 20.53 | +[default7]: iteration 1290/ 3814 | consumed samples: 660480 | consumed tokens: 1352663040 | elapsed time per iteration (s): 3.50 | learning rate: 4.155E-04 | global batch size: 512 | lm loss: 5.638621E+00 | loss scale: 16384.0 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.324 | TFLOPs: 20.54 | +[default7]: iteration 1300/ 3814 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 3.50 | learning rate: 4.138E-04 | global batch size: 512 | lm loss: 5.612315E+00 | loss scale: 16384.0 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.276 | TFLOPs: 20.54 | +[default7]: iteration 1310/ 3814 | consumed samples: 670720 | consumed tokens: 1373634560 | elapsed time per iteration (s): 3.50 | learning rate: 4.121E-04 | global batch size: 512 | lm loss: 5.609526E+00 | loss scale: 16384.0 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.261 | TFLOPs: 20.53 | +[default7]: iteration 1320/ 3814 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 3.50 | learning rate: 4.103E-04 | global batch size: 512 | lm loss: 5.602406E+00 | loss scale: 16384.0 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.243 | TFLOPs: 20.53 | +[default7]: iteration 1330/ 3814 | consumed samples: 680960 | consumed tokens: 1394606080 | elapsed time per iteration (s): 3.50 | learning rate: 4.086E-04 | global batch size: 512 | lm loss: 5.592095E+00 | loss scale: 16384.0 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.234 | TFLOPs: 20.53 | +[default7]: iteration 1340/ 3814 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 3.50 | learning rate: 4.068E-04 | global batch size: 512 | lm loss: 5.588692E+00 | loss scale: 16384.0 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.262 | TFLOPs: 20.53 | +[default7]: iteration 1350/ 3814 | consumed samples: 691200 | consumed tokens: 1415577600 | elapsed time per iteration (s): 3.50 | learning rate: 4.050E-04 | global batch size: 512 | lm loss: 5.584268E+00 | loss scale: 16384.0 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.235 | TFLOPs: 20.53 | +[default7]: iteration 1360/ 3814 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 3.50 | learning rate: 4.033E-04 | global batch size: 512 | lm loss: 5.578105E+00 | loss scale: 16384.0 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.219 | TFLOPs: 20.53 | +[default7]: iteration 1370/ 3814 | consumed samples: 701440 | consumed tokens: 1436549120 | elapsed time per iteration (s): 3.50 | learning rate: 4.015E-04 | global batch size: 512 | lm loss: 5.567706E+00 | loss scale: 16384.0 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.281 | TFLOPs: 20.54 | +[default7]: iteration 1380/ 3814 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 3.50 | learning rate: 3.996E-04 | global batch size: 512 | lm loss: 5.555022E+00 | loss scale: 16384.0 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.253 | TFLOPs: 20.53 | +[default7]: iteration 1390/ 3814 | consumed samples: 711680 | consumed tokens: 1457520640 | elapsed time per iteration (s): 3.50 | learning rate: 3.978E-04 | global batch size: 512 | lm loss: 5.544663E+00 | loss scale: 16384.0 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.237 | TFLOPs: 20.53 | +[default7]: iteration 1400/ 3814 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 3.50 | learning rate: 3.960E-04 | global batch size: 512 | lm loss: 5.539321E+00 | loss scale: 16384.0 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.210 | TFLOPs: 20.53 | +[default7]: iteration 1410/ 3814 | consumed samples: 721920 | consumed tokens: 1478492160 | elapsed time per iteration (s): 3.50 | learning rate: 3.941E-04 | global batch size: 512 | lm loss: 5.536536E+00 | loss scale: 16384.0 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.261 | TFLOPs: 20.53 | +[default7]: iteration 1420/ 3814 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 3.50 | learning rate: 3.923E-04 | global batch size: 512 | lm loss: 5.523385E+00 | loss scale: 16384.0 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.230 | TFLOPs: 20.53 | +[default7]: iteration 1430/ 3814 | consumed samples: 732160 | consumed tokens: 1499463680 | elapsed time per iteration (s): 3.50 | learning rate: 3.904E-04 | global batch size: 512 | lm loss: 5.507019E+00 | loss scale: 16384.0 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.247 | TFLOPs: 20.53 | +[default7]: iteration 1440/ 3814 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 3.50 | learning rate: 3.885E-04 | global batch size: 512 | lm loss: 5.510181E+00 | loss scale: 16384.0 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.231 | TFLOPs: 20.53 | +[default7]: iteration 1450/ 3814 | consumed samples: 742400 | consumed tokens: 1520435200 | elapsed time per iteration (s): 3.50 | learning rate: 3.866E-04 | global batch size: 512 | lm loss: 5.502772E+00 | loss scale: 16384.0 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.258 | TFLOPs: 20.53 | +[default7]: iteration 1460/ 3814 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 3.50 | learning rate: 3.847E-04 | global batch size: 512 | lm loss: 5.499038E+00 | loss scale: 16384.0 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.225 | TFLOPs: 20.53 | +[default7]: iteration 1470/ 3814 | consumed samples: 752640 | consumed tokens: 1541406720 | elapsed time per iteration (s): 3.50 | learning rate: 3.828E-04 | global batch size: 512 | lm loss: 5.486530E+00 | loss scale: 16384.0 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.274 | TFLOPs: 20.54 | +[default7]: iteration 1480/ 3814 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 3.50 | learning rate: 3.809E-04 | global batch size: 512 | lm loss: 5.482343E+00 | loss scale: 16384.0 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.164 | TFLOPs: 20.52 | +[default7]: iteration 1490/ 3814 | consumed samples: 762880 | consumed tokens: 1562378240 | elapsed time per iteration (s): 3.50 | learning rate: 3.789E-04 | global batch size: 512 | lm loss: 5.472359E+00 | loss scale: 16384.0 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.241 | TFLOPs: 20.53 | +[default7]: iteration 1500/ 3814 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 3.50 | learning rate: 3.770E-04 | global batch size: 512 | lm loss: 5.474801E+00 | loss scale: 32768.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.216 | TFLOPs: 20.53 | +[default7]: iteration 1510/ 3814 | consumed samples: 773120 | consumed tokens: 1583349760 | elapsed time per iteration (s): 3.50 | learning rate: 3.750E-04 | global batch size: 512 | lm loss: 5.459778E+00 | loss scale: 32768.0 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.180 | TFLOPs: 20.52 | +[default7]: iteration 1520/ 3814 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 3.50 | learning rate: 3.730E-04 | global batch size: 512 | lm loss: 5.450466E+00 | loss scale: 32768.0 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.216 | TFLOPs: 20.53 | +[default7]: iteration 1530/ 3814 | consumed samples: 783360 | consumed tokens: 1604321280 | elapsed time per iteration (s): 3.50 | learning rate: 3.710E-04 | global batch size: 512 | lm loss: 5.443768E+00 | loss scale: 32768.0 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.238 | TFLOPs: 20.53 | +[default7]: iteration 1540/ 3814 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 3.50 | learning rate: 3.690E-04 | global batch size: 512 | lm loss: 5.434637E+00 | loss scale: 32768.0 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.240 | TFLOPs: 20.53 | +[default7]: iteration 1550/ 3814 | consumed samples: 793600 | consumed tokens: 1625292800 | elapsed time per iteration (s): 3.50 | learning rate: 3.670E-04 | global batch size: 512 | lm loss: 5.437542E+00 | loss scale: 32768.0 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.240 | TFLOPs: 20.53 | +[default7]: iteration 1560/ 3814 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 3.50 | learning rate: 3.650E-04 | global batch size: 512 | lm loss: 5.427481E+00 | loss scale: 32768.0 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.242 | TFLOPs: 20.53 | +[default7]: iteration 1570/ 3814 | consumed samples: 803840 | consumed tokens: 1646264320 | elapsed time per iteration (s): 3.50 | learning rate: 3.630E-04 | global batch size: 512 | lm loss: 5.422375E+00 | loss scale: 32768.0 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.217 | TFLOPs: 20.53 | +[default7]: iteration 1580/ 3814 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 3.50 | learning rate: 3.610E-04 | global batch size: 512 | lm loss: 5.414244E+00 | loss scale: 32768.0 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.223 | TFLOPs: 20.53 | +[default7]: iteration 1590/ 3814 | consumed samples: 814080 | consumed tokens: 1667235840 | elapsed time per iteration (s): 3.50 | learning rate: 3.589E-04 | global batch size: 512 | lm loss: 5.401053E+00 | loss scale: 32768.0 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.228 | TFLOPs: 20.53 | +[default7]: iteration 1600/ 3814 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 3.50 | learning rate: 3.569E-04 | global batch size: 512 | lm loss: 5.399310E+00 | loss scale: 32768.0 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.252 | TFLOPs: 20.53 | +[default7]: iteration 1610/ 3814 | consumed samples: 824320 | consumed tokens: 1688207360 | elapsed time per iteration (s): 3.50 | learning rate: 3.548E-04 | global batch size: 512 | lm loss: 5.393529E+00 | loss scale: 32768.0 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.269 | TFLOPs: 20.53 | +[default7]: iteration 1620/ 3814 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 3.50 | learning rate: 3.528E-04 | global batch size: 512 | lm loss: 5.383247E+00 | loss scale: 32768.0 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.248 | TFLOPs: 20.53 | +[default7]: iteration 1630/ 3814 | consumed samples: 834560 | consumed tokens: 1709178880 | elapsed time per iteration (s): 3.50 | learning rate: 3.507E-04 | global batch size: 512 | lm loss: 5.379195E+00 | loss scale: 32768.0 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.327 | TFLOPs: 20.54 | +[default7]: iteration 1640/ 3814 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 3.50 | learning rate: 3.486E-04 | global batch size: 512 | lm loss: 5.370285E+00 | loss scale: 32768.0 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.319 | TFLOPs: 20.54 | +[default7]: iteration 1650/ 3814 | consumed samples: 844800 | consumed tokens: 1730150400 | elapsed time per iteration (s): 3.50 | learning rate: 3.465E-04 | global batch size: 512 | lm loss: 5.363318E+00 | loss scale: 32768.0 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.266 | TFLOPs: 20.53 | +[default7]: iteration 1660/ 3814 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 3.50 | learning rate: 3.444E-04 | global batch size: 512 | lm loss: 5.353438E+00 | loss scale: 32768.0 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.278 | TFLOPs: 20.54 | +[default7]: iteration 1670/ 3814 | consumed samples: 855040 | consumed tokens: 1751121920 | elapsed time per iteration (s): 3.50 | learning rate: 3.423E-04 | global batch size: 512 | lm loss: 5.353006E+00 | loss scale: 32768.0 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.275 | TFLOPs: 20.54 | +[default7]: iteration 1680/ 3814 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 3.50 | learning rate: 3.402E-04 | global batch size: 512 | lm loss: 5.346856E+00 | loss scale: 32768.0 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.270 | TFLOPs: 20.54 | +[default7]: iteration 1690/ 3814 | consumed samples: 865280 | consumed tokens: 1772093440 | elapsed time per iteration (s): 3.50 | learning rate: 3.381E-04 | global batch size: 512 | lm loss: 5.346691E+00 | loss scale: 32768.0 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.233 | TFLOPs: 20.53 | +[default7]: iteration 1700/ 3814 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 3.50 | learning rate: 3.359E-04 | global batch size: 512 | lm loss: 5.341690E+00 | loss scale: 32768.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.243 | TFLOPs: 20.53 | +[default7]: iteration 1710/ 3814 | consumed samples: 875520 | consumed tokens: 1793064960 | elapsed time per iteration (s): 3.50 | learning rate: 3.338E-04 | global batch size: 512 | lm loss: 5.327769E+00 | loss scale: 32768.0 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.220 | TFLOPs: 20.53 | +[default7]: iteration 1720/ 3814 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 3.50 | learning rate: 3.317E-04 | global batch size: 512 | lm loss: 5.322400E+00 | loss scale: 32768.0 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.219 | TFLOPs: 20.53 | +[default7]: iteration 1730/ 3814 | consumed samples: 885760 | consumed tokens: 1814036480 | elapsed time per iteration (s): 3.50 | learning rate: 3.295E-04 | global batch size: 512 | lm loss: 5.313205E+00 | loss scale: 32768.0 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.228 | TFLOPs: 20.53 | +[default7]: iteration 1740/ 3814 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 3.50 | learning rate: 3.274E-04 | global batch size: 512 | lm loss: 5.308377E+00 | loss scale: 32768.0 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.301 | TFLOPs: 20.54 | +[default7]: iteration 1750/ 3814 | consumed samples: 896000 | consumed tokens: 1835008000 | elapsed time per iteration (s): 3.50 | learning rate: 3.252E-04 | global batch size: 512 | lm loss: 5.306791E+00 | loss scale: 32768.0 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.343 | TFLOPs: 20.55 | +[default7]: iteration 1760/ 3814 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 3.50 | learning rate: 3.230E-04 | global batch size: 512 | lm loss: 5.303168E+00 | loss scale: 32768.0 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.323 | TFLOPs: 20.54 | +[default7]: iteration 1770/ 3814 | consumed samples: 906240 | consumed tokens: 1855979520 | elapsed time per iteration (s): 3.50 | learning rate: 3.208E-04 | global batch size: 512 | lm loss: 5.303556E+00 | loss scale: 32768.0 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.342 | TFLOPs: 20.55 | +[default7]: iteration 1780/ 3814 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 3.50 | learning rate: 3.187E-04 | global batch size: 512 | lm loss: 5.288455E+00 | loss scale: 32768.0 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.340 | TFLOPs: 20.54 | +[default7]: iteration 1790/ 3814 | consumed samples: 916480 | consumed tokens: 1876951040 | elapsed time per iteration (s): 3.50 | learning rate: 3.165E-04 | global batch size: 512 | lm loss: 5.285187E+00 | loss scale: 32768.0 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.313 | TFLOPs: 20.54 | +[default7]: iteration 1800/ 3814 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 3.50 | learning rate: 3.143E-04 | global batch size: 512 | lm loss: 5.283298E+00 | loss scale: 32768.0 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.317 | TFLOPs: 20.54 | +[default7]: iteration 1810/ 3814 | consumed samples: 926720 | consumed tokens: 1897922560 | elapsed time per iteration (s): 3.50 | learning rate: 3.121E-04 | global batch size: 512 | lm loss: 5.268293E+00 | loss scale: 32768.0 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.115 | TFLOPs: 20.51 | +[default7]: iteration 1820/ 3814 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 3.50 | learning rate: 3.099E-04 | global batch size: 512 | lm loss: 5.268316E+00 | loss scale: 32768.0 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.264 | TFLOPs: 20.53 | +[default7]: iteration 1830/ 3814 | consumed samples: 936960 | consumed tokens: 1918894080 | elapsed time per iteration (s): 3.50 | learning rate: 3.077E-04 | global batch size: 512 | lm loss: 5.256224E+00 | loss scale: 32768.0 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.281 | TFLOPs: 20.54 | +[default7]: iteration 1840/ 3814 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 3.50 | learning rate: 3.055E-04 | global batch size: 512 | lm loss: 5.261451E+00 | loss scale: 32768.0 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.303 | TFLOPs: 20.54 | +[default7]: iteration 1850/ 3814 | consumed samples: 947200 | consumed tokens: 1939865600 | elapsed time per iteration (s): 3.50 | learning rate: 3.032E-04 | global batch size: 512 | lm loss: 5.245650E+00 | loss scale: 32768.0 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.241 | TFLOPs: 20.53 | +[default7]: iteration 1860/ 3814 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 3.50 | learning rate: 3.010E-04 | global batch size: 512 | lm loss: 5.243266E+00 | loss scale: 32768.0 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.357 | TFLOPs: 20.55 | +[default7]: iteration 1870/ 3814 | consumed samples: 957440 | consumed tokens: 1960837120 | elapsed time per iteration (s): 3.50 | learning rate: 2.988E-04 | global batch size: 512 | lm loss: 5.234014E+00 | loss scale: 32768.0 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.373 | TFLOPs: 20.55 | +[default7]: iteration 1880/ 3814 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 3.50 | learning rate: 2.966E-04 | global batch size: 512 | lm loss: 5.238946E+00 | loss scale: 32768.0 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.288 | TFLOPs: 20.54 | +[default7]: iteration 1890/ 3814 | consumed samples: 967680 | consumed tokens: 1981808640 | elapsed time per iteration (s): 3.50 | learning rate: 2.943E-04 | global batch size: 512 | lm loss: 5.232159E+00 | loss scale: 32768.0 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.285 | TFLOPs: 20.54 | +[default7]: iteration 1900/ 3814 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 3.50 | learning rate: 2.921E-04 | global batch size: 512 | lm loss: 5.230960E+00 | loss scale: 32768.0 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.290 | TFLOPs: 20.54 | +[default7]: iteration 1910/ 3814 | consumed samples: 977920 | consumed tokens: 2002780160 | elapsed time per iteration (s): 3.50 | learning rate: 2.899E-04 | global batch size: 512 | lm loss: 5.228464E+00 | loss scale: 32768.0 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.317 | TFLOPs: 20.54 | +[default7]: iteration 1920/ 3814 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 3.50 | learning rate: 2.876E-04 | global batch size: 512 | lm loss: 5.205968E+00 | loss scale: 32768.0 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.316 | TFLOPs: 20.54 | +[default7]: iteration 1930/ 3814 | consumed samples: 988160 | consumed tokens: 2023751680 | elapsed time per iteration (s): 3.50 | learning rate: 2.854E-04 | global batch size: 512 | lm loss: 5.217755E+00 | loss scale: 32768.0 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.312 | TFLOPs: 20.54 | +[default7]: iteration 1940/ 3814 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 3.50 | learning rate: 2.831E-04 | global batch size: 512 | lm loss: 5.208467E+00 | loss scale: 32768.0 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.311 | TFLOPs: 20.54 | +[default7]: iteration 1950/ 3814 | consumed samples: 998400 | consumed tokens: 2044723200 | elapsed time per iteration (s): 3.50 | learning rate: 2.809E-04 | global batch size: 512 | lm loss: 5.203500E+00 | loss scale: 32768.0 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.332 | TFLOPs: 20.54 | +[default7]: iteration 1960/ 3814 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 3.50 | learning rate: 2.786E-04 | global batch size: 512 | lm loss: 5.204367E+00 | loss scale: 32768.0 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.319 | TFLOPs: 20.54 | +[default7]: iteration 1970/ 3814 | consumed samples: 1008640 | consumed tokens: 2065694720 | elapsed time per iteration (s): 3.50 | learning rate: 2.763E-04 | global batch size: 512 | lm loss: 5.182655E+00 | loss scale: 32768.0 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.354 | TFLOPs: 20.55 | +[default7]: iteration 1980/ 3814 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 3.50 | learning rate: 2.741E-04 | global batch size: 512 | lm loss: 5.190867E+00 | loss scale: 32768.0 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.299 | TFLOPs: 20.54 | +[default7]: iteration 1990/ 3814 | consumed samples: 1018880 | consumed tokens: 2086666240 | elapsed time per iteration (s): 3.50 | learning rate: 2.718E-04 | global batch size: 512 | lm loss: 5.184834E+00 | loss scale: 32768.0 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.305 | TFLOPs: 20.54 | +[default0]:[2023-02-03 09:45:05,367] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.00026955483427458683, 0.0005391096685491737, 0.00026955483427458683, 0.0005391096685491737], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default7]: iteration 2000/ 3814 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 3.50 | learning rate: 2.696E-04 | global batch size: 512 | lm loss: 5.175253E+00 | loss scale: 65536.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.277 | TFLOPs: 20.54 | +[default0]:steps: 2000 loss: 5.1656 iter time (s): 3.496 samples/sec: 146.435 +[default0]:saving checkpoint at iteration 2000 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default0]:[2023-02-03 09:46:44,922] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! +[default0]:[2023-02-03 09:46:44,925] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_01-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_01-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_04-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_04-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_05-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_05-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,958] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_06-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_06-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,962] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_07-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_07-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,964] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_08-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_08-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,967] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_09-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_09-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_10-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_10-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_11-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_11-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,974] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_12-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_12-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_13-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_13-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,980] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_14-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_14-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_15-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_15-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_16-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_16-model_00-model_states.pt. +[default7]:------------------------------------------------------------------------------------------ +[default7]:valid loss at iteration 2000 | lm loss value: 5.101256E+00 | lm loss PPL: 1.642280E+02 | +[default7]:------------------------------------------------------------------------------------------ +[default3]:[2023-02-03 09:46:45,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default0]:[2023-02-03 09:46:44,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_17-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_17-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,991] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_18-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_18-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_19-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_19-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_20-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:44,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_20-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:44,998] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_21-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_21-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,000] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_22-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_22-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_23-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_23-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_24-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_24-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_25-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_25-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,010] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_26-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_26-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_27-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_27-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,016] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_28-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_28-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_29-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_29-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_30-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_30-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,023] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_31-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_31-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,025] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_32-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_32-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,028] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_33-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_33-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,030] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_34-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_34-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_35-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_35-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,035] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_37-model_00-model_states.pt... +[default0]:[2023-02-03 09:46:45,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/layer_37-model_00-model_states.pt. +[default0]:[2023-02-03 09:46:45,037] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/mp_rank_00_model_states.pt +[default0]:[2023-02-03 09:46:45,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/mp_rank_00_model_states.pt... +[default0]:[2023-02-03 09:46:45,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/mp_rank_00_model_states.pt. +[default0]:[2023-02-03 09:46:45,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default2]:[2023-02-03 09:46:45,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default5]:[2023-02-03 09:46:45,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default1]:[2023-02-03 09:46:45,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default4]:[2023-02-03 09:46:45,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default7]:[2023-02-03 09:46:45,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default7]:[2023-02-03 09:46:45,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default4]:[2023-02-03 09:46:45,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2023-02-03 09:46:45,111] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default7]:[2023-02-03 09:46:45,107] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default7]:[2023-02-03 09:46:45,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default4]:[2023-02-03 09:46:45,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default6]:[2023-02-03 09:46:45,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default6]:[2023-02-03 09:46:45,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2023-02-03 09:46:45,109] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default6]:[2023-02-03 09:46:45,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default3]:[2023-02-03 09:46:45,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2023-02-03 09:46:45,117] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default3]:[2023-02-03 09:46:45,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default0]:[2023-02-03 09:46:45,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2023-02-03 09:46:45,113] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2023-02-03 09:46:45,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default0]: successfully saved checkpoint at iteration 2000 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default2]:[2023-02-03 09:46:45,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2023-02-03 09:46:45,113] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default2]:[2023-02-03 09:46:45,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default5]:[2023-02-03 09:46:45,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2023-02-03 09:46:45,113] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default5]:[2023-02-03 09:46:45,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default1]:[2023-02-03 09:46:45,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2023-02-03 09:46:45,115] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step2000/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default1]:[2023-02-03 09:46:45,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default7]:time (ms) | save-checkpoint: 197.76 +[default7]: iteration 2010/ 3814 | consumed samples: 1029120 | consumed tokens: 2107637760 | elapsed time per iteration (s): 13.48 | learning rate: 2.673E-04 | global batch size: 512 | lm loss: 5.180109E+00 | loss scale: 65536.0 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 37.979 | TFLOPs: 5.33 | +[default7]: iteration 2020/ 3814 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 3.50 | learning rate: 2.650E-04 | global batch size: 512 | lm loss: 5.177064E+00 | loss scale: 65536.0 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.121 | TFLOPs: 20.51 | +[default7]: iteration 2030/ 3814 | consumed samples: 1039360 | consumed tokens: 2128609280 | elapsed time per iteration (s): 3.50 | learning rate: 2.628E-04 | global batch size: 512 | lm loss: 5.167480E+00 | loss scale: 65536.0 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.117 | TFLOPs: 20.51 | +[default7]: iteration 2040/ 3814 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 3.50 | learning rate: 2.605E-04 | global batch size: 512 | lm loss: 5.162463E+00 | loss scale: 65536.0 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.084 | TFLOPs: 20.51 | +[default7]: iteration 2050/ 3814 | consumed samples: 1049600 | consumed tokens: 2149580800 | elapsed time per iteration (s): 3.50 | learning rate: 2.582E-04 | global batch size: 512 | lm loss: 5.157613E+00 | loss scale: 65536.0 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.201 | TFLOPs: 20.53 | +[default7]: iteration 2060/ 3814 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 3.50 | learning rate: 2.559E-04 | global batch size: 512 | lm loss: 5.151617E+00 | loss scale: 65536.0 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.312 | TFLOPs: 20.54 | +[default7]: iteration 2070/ 3814 | consumed samples: 1059840 | consumed tokens: 2170552320 | elapsed time per iteration (s): 3.50 | learning rate: 2.537E-04 | global batch size: 512 | lm loss: 5.143462E+00 | loss scale: 65536.0 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.310 | TFLOPs: 20.54 | +[default7]: iteration 2080/ 3814 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 3.50 | learning rate: 2.514E-04 | global batch size: 512 | lm loss: 5.153197E+00 | loss scale: 65536.0 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.308 | TFLOPs: 20.54 | +[default7]: iteration 2090/ 3814 | consumed samples: 1070080 | consumed tokens: 2191523840 | elapsed time per iteration (s): 3.50 | learning rate: 2.491E-04 | global batch size: 512 | lm loss: 5.132853E+00 | loss scale: 65536.0 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.403 | TFLOPs: 20.55 | +[default7]: iteration 2100/ 3814 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 3.50 | learning rate: 2.469E-04 | global batch size: 512 | lm loss: 5.129685E+00 | loss scale: 65536.0 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.357 | TFLOPs: 20.55 | +[default7]: iteration 2110/ 3814 | consumed samples: 1080320 | consumed tokens: 2212495360 | elapsed time per iteration (s): 3.50 | learning rate: 2.446E-04 | global batch size: 512 | lm loss: 5.137356E+00 | loss scale: 65536.0 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.324 | TFLOPs: 20.54 | +[default7]: iteration 2120/ 3814 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 3.50 | learning rate: 2.423E-04 | global batch size: 512 | lm loss: 5.133213E+00 | loss scale: 65536.0 | grad norm: 0.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.309 | TFLOPs: 20.54 | +[default7]: iteration 2130/ 3814 | consumed samples: 1090560 | consumed tokens: 2233466880 | elapsed time per iteration (s): 3.50 | learning rate: 2.400E-04 | global batch size: 512 | lm loss: 5.125665E+00 | loss scale: 65536.0 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.321 | TFLOPs: 20.54 | +[default7]: iteration 2140/ 3814 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 3.50 | learning rate: 2.378E-04 | global batch size: 512 | lm loss: 5.120090E+00 | loss scale: 65536.0 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.323 | TFLOPs: 20.54 | +[default7]: iteration 2150/ 3814 | consumed samples: 1100800 | consumed tokens: 2254438400 | elapsed time per iteration (s): 3.50 | learning rate: 2.355E-04 | global batch size: 512 | lm loss: 5.116153E+00 | loss scale: 65536.0 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.312 | TFLOPs: 20.54 | +[default7]: iteration 2160/ 3814 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 3.50 | learning rate: 2.332E-04 | global batch size: 512 | lm loss: 5.110509E+00 | loss scale: 65536.0 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.317 | TFLOPs: 20.54 | +[default7]: iteration 2170/ 3814 | consumed samples: 1111040 | consumed tokens: 2275409920 | elapsed time per iteration (s): 3.50 | learning rate: 2.310E-04 | global batch size: 512 | lm loss: 5.107160E+00 | loss scale: 65536.0 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.337 | TFLOPs: 20.54 | +[default7]: iteration 2180/ 3814 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 3.50 | learning rate: 2.287E-04 | global batch size: 512 | lm loss: 5.110184E+00 | loss scale: 65536.0 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.353 | TFLOPs: 20.55 | +[default7]: iteration 2190/ 3814 | consumed samples: 1121280 | consumed tokens: 2296381440 | elapsed time per iteration (s): 3.50 | learning rate: 2.264E-04 | global batch size: 512 | lm loss: 5.099596E+00 | loss scale: 65536.0 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.383 | TFLOPs: 20.55 | +[default7]: iteration 2200/ 3814 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 3.50 | learning rate: 2.242E-04 | global batch size: 512 | lm loss: 5.100008E+00 | loss scale: 65536.0 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.336 | TFLOPs: 20.54 | +[default7]: iteration 2210/ 3814 | consumed samples: 1131520 | consumed tokens: 2317352960 | elapsed time per iteration (s): 3.50 | learning rate: 2.219E-04 | global batch size: 512 | lm loss: 5.097319E+00 | loss scale: 65536.0 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.364 | TFLOPs: 20.55 | +[default7]: iteration 2220/ 3814 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 3.50 | learning rate: 2.197E-04 | global batch size: 512 | lm loss: 5.088415E+00 | loss scale: 65536.0 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.302 | TFLOPs: 20.54 | +[default7]: iteration 2230/ 3814 | consumed samples: 1141760 | consumed tokens: 2338324480 | elapsed time per iteration (s): 3.50 | learning rate: 2.174E-04 | global batch size: 512 | lm loss: 5.094426E+00 | loss scale: 65536.0 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.254 | TFLOPs: 20.53 | +[default7]: iteration 2240/ 3814 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 3.50 | learning rate: 2.152E-04 | global batch size: 512 | lm loss: 5.078577E+00 | loss scale: 65536.0 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.279 | TFLOPs: 20.54 | +[default7]: iteration 2250/ 3814 | consumed samples: 1152000 | consumed tokens: 2359296000 | elapsed time per iteration (s): 3.50 | learning rate: 2.129E-04 | global batch size: 512 | lm loss: 5.069790E+00 | loss scale: 65536.0 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.277 | TFLOPs: 20.54 | +[default7]: iteration 2260/ 3814 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 3.50 | learning rate: 2.107E-04 | global batch size: 512 | lm loss: 5.067738E+00 | loss scale: 65536.0 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.274 | TFLOPs: 20.54 | +[default7]: iteration 2270/ 3814 | consumed samples: 1162240 | consumed tokens: 2380267520 | elapsed time per iteration (s): 3.50 | learning rate: 2.084E-04 | global batch size: 512 | lm loss: 5.070375E+00 | loss scale: 65536.0 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.282 | TFLOPs: 20.54 | +[default7]: iteration 2280/ 3814 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 3.50 | learning rate: 2.062E-04 | global batch size: 512 | lm loss: 5.074162E+00 | loss scale: 65536.0 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.310 | TFLOPs: 20.54 | +[default7]: iteration 2290/ 3814 | consumed samples: 1172480 | consumed tokens: 2401239040 | elapsed time per iteration (s): 3.50 | learning rate: 2.040E-04 | global batch size: 512 | lm loss: 5.069159E+00 | loss scale: 65536.0 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.303 | TFLOPs: 20.54 | +[default7]: iteration 2300/ 3814 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 3.50 | learning rate: 2.017E-04 | global batch size: 512 | lm loss: 5.065920E+00 | loss scale: 65536.0 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.298 | TFLOPs: 20.54 | +[default7]: iteration 2310/ 3814 | consumed samples: 1182720 | consumed tokens: 2422210560 | elapsed time per iteration (s): 3.50 | learning rate: 1.995E-04 | global batch size: 512 | lm loss: 5.068484E+00 | loss scale: 65536.0 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.271 | TFLOPs: 20.54 | +[default7]: iteration 2320/ 3814 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 3.50 | learning rate: 1.973E-04 | global batch size: 512 | lm loss: 5.051132E+00 | loss scale: 65536.0 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.307 | TFLOPs: 20.54 | +[default7]: iteration 2330/ 3814 | consumed samples: 1192960 | consumed tokens: 2443182080 | elapsed time per iteration (s): 3.50 | learning rate: 1.951E-04 | global batch size: 512 | lm loss: 5.055693E+00 | loss scale: 65536.0 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.284 | TFLOPs: 20.54 | +[default7]: iteration 2340/ 3814 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 3.50 | learning rate: 1.928E-04 | global batch size: 512 | lm loss: 5.051280E+00 | loss scale: 65536.0 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.289 | TFLOPs: 20.54 | +[default7]: iteration 2350/ 3814 | consumed samples: 1203200 | consumed tokens: 2464153600 | elapsed time per iteration (s): 3.50 | learning rate: 1.906E-04 | global batch size: 512 | lm loss: 5.046793E+00 | loss scale: 65536.0 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.295 | TFLOPs: 20.54 | +[default7]: iteration 2360/ 3814 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 3.50 | learning rate: 1.884E-04 | global batch size: 512 | lm loss: 5.040703E+00 | loss scale: 65536.0 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.289 | TFLOPs: 20.54 | +[default7]: iteration 2370/ 3814 | consumed samples: 1213440 | consumed tokens: 2485125120 | elapsed time per iteration (s): 3.50 | learning rate: 1.862E-04 | global batch size: 512 | lm loss: 5.051883E+00 | loss scale: 65536.0 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.276 | TFLOPs: 20.54 | +[default7]: iteration 2380/ 3814 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 3.50 | learning rate: 1.840E-04 | global batch size: 512 | lm loss: 5.042635E+00 | loss scale: 65536.0 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.277 | TFLOPs: 20.54 | +[default7]: iteration 2390/ 3814 | consumed samples: 1223680 | consumed tokens: 2506096640 | elapsed time per iteration (s): 3.50 | learning rate: 1.818E-04 | global batch size: 512 | lm loss: 5.039528E+00 | loss scale: 65536.0 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.262 | TFLOPs: 20.53 | +[default7]: iteration 2400/ 3814 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 3.50 | learning rate: 1.797E-04 | global batch size: 512 | lm loss: 5.037461E+00 | loss scale: 65536.0 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.306 | TFLOPs: 20.54 | +[default7]: iteration 2410/ 3814 | consumed samples: 1233920 | consumed tokens: 2527068160 | elapsed time per iteration (s): 3.50 | learning rate: 1.775E-04 | global batch size: 512 | lm loss: 5.025623E+00 | loss scale: 65536.0 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.311 | TFLOPs: 20.54 | +[default7]: iteration 2420/ 3814 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 3.50 | learning rate: 1.753E-04 | global batch size: 512 | lm loss: 5.029846E+00 | loss scale: 65536.0 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.321 | TFLOPs: 20.54 | +[default7]: iteration 2430/ 3814 | consumed samples: 1244160 | consumed tokens: 2548039680 | elapsed time per iteration (s): 3.50 | learning rate: 1.731E-04 | global batch size: 512 | lm loss: 5.015889E+00 | loss scale: 65536.0 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.364 | TFLOPs: 20.55 | +[default7]: iteration 2440/ 3814 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 3.50 | learning rate: 1.710E-04 | global batch size: 512 | lm loss: 5.022284E+00 | loss scale: 65536.0 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.366 | TFLOPs: 20.55 | +[default7]: iteration 2450/ 3814 | consumed samples: 1254400 | consumed tokens: 2569011200 | elapsed time per iteration (s): 3.50 | learning rate: 1.688E-04 | global batch size: 512 | lm loss: 5.016843E+00 | loss scale: 65536.0 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.342 | TFLOPs: 20.55 | +[default7]: iteration 2460/ 3814 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 3.50 | learning rate: 1.667E-04 | global batch size: 512 | lm loss: 5.022736E+00 | loss scale: 65536.0 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.293 | TFLOPs: 20.54 | +[default7]: iteration 2470/ 3814 | consumed samples: 1264640 | consumed tokens: 2589982720 | elapsed time per iteration (s): 3.50 | learning rate: 1.646E-04 | global batch size: 512 | lm loss: 5.017162E+00 | loss scale: 65536.0 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.347 | TFLOPs: 20.55 | +[default7]: iteration 2480/ 3814 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 3.50 | learning rate: 1.624E-04 | global batch size: 512 | lm loss: 5.011271E+00 | loss scale: 65536.0 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.340 | TFLOPs: 20.54 | +[default7]: iteration 2490/ 3814 | consumed samples: 1274880 | consumed tokens: 2610954240 | elapsed time per iteration (s): 3.50 | learning rate: 1.603E-04 | global batch size: 512 | lm loss: 5.008818E+00 | loss scale: 65536.0 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.327 | TFLOPs: 20.54 | +[default7]: iteration 2500/ 3814 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 3.50 | learning rate: 1.582E-04 | global batch size: 512 | lm loss: 5.009525E+00 | loss scale: 131072.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.289 | TFLOPs: 20.54 | +[default7]: iteration 2510/ 3814 | consumed samples: 1285120 | consumed tokens: 2631925760 | elapsed time per iteration (s): 3.50 | learning rate: 1.561E-04 | global batch size: 512 | lm loss: 5.005350E+00 | loss scale: 131072.0 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.265 | TFLOPs: 20.53 | +[default7]: iteration 2520/ 3814 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 3.50 | learning rate: 1.540E-04 | global batch size: 512 | lm loss: 5.009430E+00 | loss scale: 131072.0 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.320 | TFLOPs: 20.54 | +[default7]: iteration 2530/ 3814 | consumed samples: 1295360 | consumed tokens: 2652897280 | elapsed time per iteration (s): 3.50 | learning rate: 1.519E-04 | global batch size: 512 | lm loss: 5.003883E+00 | loss scale: 131072.0 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.332 | TFLOPs: 20.54 | +[default7]: iteration 2540/ 3814 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 3.50 | learning rate: 1.498E-04 | global batch size: 512 | lm loss: 5.005626E+00 | loss scale: 131072.0 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.353 | TFLOPs: 20.55 | +[default7]: iteration 2550/ 3814 | consumed samples: 1305600 | consumed tokens: 2673868800 | elapsed time per iteration (s): 3.50 | learning rate: 1.477E-04 | global batch size: 512 | lm loss: 4.995838E+00 | loss scale: 131072.0 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.342 | TFLOPs: 20.55 | +[default7]: iteration 2560/ 3814 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 3.50 | learning rate: 1.456E-04 | global batch size: 512 | lm loss: 4.987294E+00 | loss scale: 131072.0 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.305 | TFLOPs: 20.54 | +[default7]: iteration 2570/ 3814 | consumed samples: 1315840 | consumed tokens: 2694840320 | elapsed time per iteration (s): 3.50 | learning rate: 1.436E-04 | global batch size: 512 | lm loss: 4.989835E+00 | loss scale: 131072.0 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.318 | TFLOPs: 20.54 | +[default7]: iteration 2580/ 3814 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 3.50 | learning rate: 1.415E-04 | global batch size: 512 | lm loss: 4.996525E+00 | loss scale: 131072.0 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.329 | TFLOPs: 20.54 | +[default7]: iteration 2590/ 3814 | consumed samples: 1326080 | consumed tokens: 2715811840 | elapsed time per iteration (s): 3.50 | learning rate: 1.395E-04 | global batch size: 512 | lm loss: 4.981519E+00 | loss scale: 131072.0 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.330 | TFLOPs: 20.54 | +[default7]: iteration 2600/ 3814 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 3.50 | learning rate: 1.375E-04 | global batch size: 512 | lm loss: 4.988669E+00 | loss scale: 131072.0 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.335 | TFLOPs: 20.54 | +[default7]: iteration 2610/ 3814 | consumed samples: 1336320 | consumed tokens: 2736783360 | elapsed time per iteration (s): 3.50 | learning rate: 1.354E-04 | global batch size: 512 | lm loss: 4.978912E+00 | loss scale: 131072.0 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.367 | TFLOPs: 20.55 | +[default7]: iteration 2620/ 3814 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 3.50 | learning rate: 1.334E-04 | global batch size: 512 | lm loss: 4.976591E+00 | loss scale: 131072.0 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.243 | TFLOPs: 20.53 | +[default7]: iteration 2630/ 3814 | consumed samples: 1346560 | consumed tokens: 2757754880 | elapsed time per iteration (s): 3.50 | learning rate: 1.314E-04 | global batch size: 512 | lm loss: 4.975288E+00 | loss scale: 131072.0 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.324 | TFLOPs: 20.54 | +[default7]: iteration 2640/ 3814 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 3.50 | learning rate: 1.294E-04 | global batch size: 512 | lm loss: 4.975443E+00 | loss scale: 131072.0 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.347 | TFLOPs: 20.55 | +[default7]: iteration 2650/ 3814 | consumed samples: 1356800 | consumed tokens: 2778726400 | elapsed time per iteration (s): 3.50 | learning rate: 1.274E-04 | global batch size: 512 | lm loss: 4.962476E+00 | loss scale: 131072.0 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.337 | TFLOPs: 20.54 | +[default7]: iteration 2660/ 3814 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 3.50 | learning rate: 1.255E-04 | global batch size: 512 | lm loss: 4.972200E+00 | loss scale: 131072.0 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.330 | TFLOPs: 20.54 | +[default7]: iteration 2670/ 3814 | consumed samples: 1367040 | consumed tokens: 2799697920 | elapsed time per iteration (s): 3.50 | learning rate: 1.235E-04 | global batch size: 512 | lm loss: 4.963987E+00 | loss scale: 131072.0 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.341 | TFLOPs: 20.55 | +[default7]: iteration 2680/ 3814 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 3.50 | learning rate: 1.215E-04 | global batch size: 512 | lm loss: 4.966713E+00 | loss scale: 131072.0 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.349 | TFLOPs: 20.55 | +[default7]: iteration 2690/ 3814 | consumed samples: 1377280 | consumed tokens: 2820669440 | elapsed time per iteration (s): 3.50 | learning rate: 1.196E-04 | global batch size: 512 | lm loss: 4.954952E+00 | loss scale: 131072.0 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.339 | TFLOPs: 20.54 | +[default7]: iteration 2700/ 3814 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 3.50 | learning rate: 1.177E-04 | global batch size: 512 | lm loss: 4.971128E+00 | loss scale: 131072.0 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.322 | TFLOPs: 20.54 | +[default7]: iteration 2710/ 3814 | consumed samples: 1387520 | consumed tokens: 2841640960 | elapsed time per iteration (s): 3.50 | learning rate: 1.157E-04 | global batch size: 512 | lm loss: 4.962166E+00 | loss scale: 131072.0 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.317 | TFLOPs: 20.54 | +[default7]: iteration 2720/ 3814 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 3.50 | learning rate: 1.138E-04 | global batch size: 512 | lm loss: 4.963102E+00 | loss scale: 131072.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.364 | TFLOPs: 20.55 | +[default7]: iteration 2730/ 3814 | consumed samples: 1397760 | consumed tokens: 2862612480 | elapsed time per iteration (s): 3.50 | learning rate: 1.119E-04 | global batch size: 512 | lm loss: 4.956701E+00 | loss scale: 131072.0 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.292 | TFLOPs: 20.54 | +[default7]: iteration 2740/ 3814 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 3.50 | learning rate: 1.100E-04 | global batch size: 512 | lm loss: 4.953621E+00 | loss scale: 131072.0 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.420 | TFLOPs: 20.56 | +[default7]: iteration 2750/ 3814 | consumed samples: 1408000 | consumed tokens: 2883584000 | elapsed time per iteration (s): 3.50 | learning rate: 1.082E-04 | global batch size: 512 | lm loss: 4.948414E+00 | loss scale: 131072.0 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.417 | TFLOPs: 20.56 | +[default7]: iteration 2760/ 3814 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 3.50 | learning rate: 1.063E-04 | global batch size: 512 | lm loss: 4.944056E+00 | loss scale: 131072.0 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.367 | TFLOPs: 20.55 | +[default7]: iteration 2770/ 3814 | consumed samples: 1418240 | consumed tokens: 2904555520 | elapsed time per iteration (s): 3.50 | learning rate: 1.044E-04 | global batch size: 512 | lm loss: 4.944813E+00 | loss scale: 131072.0 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.419 | TFLOPs: 20.56 | +[default7]: iteration 2780/ 3814 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 3.50 | learning rate: 1.026E-04 | global batch size: 512 | lm loss: 4.952984E+00 | loss scale: 131072.0 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.403 | TFLOPs: 20.55 | +[default7]: iteration 2790/ 3814 | consumed samples: 1428480 | consumed tokens: 2925527040 | elapsed time per iteration (s): 3.50 | learning rate: 1.008E-04 | global batch size: 512 | lm loss: 4.946525E+00 | loss scale: 131072.0 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.406 | TFLOPs: 20.55 | +[default7]: iteration 2800/ 3814 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 3.50 | learning rate: 9.896E-05 | global batch size: 512 | lm loss: 4.946445E+00 | loss scale: 131072.0 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.337 | TFLOPs: 20.54 | +[default7]: iteration 2810/ 3814 | consumed samples: 1438720 | consumed tokens: 2946498560 | elapsed time per iteration (s): 3.50 | learning rate: 9.716E-05 | global batch size: 512 | lm loss: 4.943497E+00 | loss scale: 131072.0 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.313 | TFLOPs: 20.54 | +[default7]: iteration 2820/ 3814 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 3.50 | learning rate: 9.537E-05 | global batch size: 512 | lm loss: 4.946733E+00 | loss scale: 131072.0 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.360 | TFLOPs: 20.55 | +[default7]: iteration 2830/ 3814 | consumed samples: 1448960 | consumed tokens: 2967470080 | elapsed time per iteration (s): 3.50 | learning rate: 9.359E-05 | global batch size: 512 | lm loss: 4.940904E+00 | loss scale: 131072.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.334 | TFLOPs: 20.54 | +[default7]: iteration 2840/ 3814 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 3.50 | learning rate: 9.182E-05 | global batch size: 512 | lm loss: 4.932193E+00 | loss scale: 131072.0 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.349 | TFLOPs: 20.55 | +[default7]: iteration 2850/ 3814 | consumed samples: 1459200 | consumed tokens: 2988441600 | elapsed time per iteration (s): 3.50 | learning rate: 9.007E-05 | global batch size: 512 | lm loss: 4.938750E+00 | loss scale: 131072.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.417 | TFLOPs: 20.56 | +[default7]: iteration 2860/ 3814 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 3.50 | learning rate: 8.833E-05 | global batch size: 512 | lm loss: 4.942112E+00 | loss scale: 131072.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.392 | TFLOPs: 20.55 | +[default7]: iteration 2870/ 3814 | consumed samples: 1469440 | consumed tokens: 3009413120 | elapsed time per iteration (s): 3.50 | learning rate: 8.660E-05 | global batch size: 512 | lm loss: 4.932373E+00 | loss scale: 131072.0 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.336 | TFLOPs: 20.54 | +[default7]: iteration 2880/ 3814 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 3.50 | learning rate: 8.489E-05 | global batch size: 512 | lm loss: 4.940631E+00 | loss scale: 131072.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.403 | TFLOPs: 20.55 | +[default7]: iteration 2890/ 3814 | consumed samples: 1479680 | consumed tokens: 3030384640 | elapsed time per iteration (s): 3.50 | learning rate: 8.319E-05 | global batch size: 512 | lm loss: 4.931613E+00 | loss scale: 131072.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.389 | TFLOPs: 20.55 | +[default7]: iteration 2900/ 3814 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 3.50 | learning rate: 8.151E-05 | global batch size: 512 | lm loss: 4.936777E+00 | loss scale: 131072.0 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.389 | TFLOPs: 20.55 | +[default7]: iteration 2910/ 3814 | consumed samples: 1489920 | consumed tokens: 3051356160 | elapsed time per iteration (s): 3.50 | learning rate: 7.984E-05 | global batch size: 512 | lm loss: 4.937520E+00 | loss scale: 131072.0 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.380 | TFLOPs: 20.55 | +[default7]: iteration 2920/ 3814 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 3.50 | learning rate: 7.818E-05 | global batch size: 512 | lm loss: 4.924782E+00 | loss scale: 131072.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.381 | TFLOPs: 20.55 | +[default7]: iteration 2930/ 3814 | consumed samples: 1500160 | consumed tokens: 3072327680 | elapsed time per iteration (s): 3.50 | learning rate: 7.654E-05 | global batch size: 512 | lm loss: 4.930707E+00 | loss scale: 131072.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.417 | TFLOPs: 20.56 | +[default7]: iteration 2940/ 3814 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 3.50 | learning rate: 7.491E-05 | global batch size: 512 | lm loss: 4.923027E+00 | loss scale: 131072.0 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.373 | TFLOPs: 20.55 | +[default7]: iteration 2950/ 3814 | consumed samples: 1510400 | consumed tokens: 3093299200 | elapsed time per iteration (s): 3.50 | learning rate: 7.329E-05 | global batch size: 512 | lm loss: 4.921002E+00 | loss scale: 131072.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.389 | TFLOPs: 20.55 | +[default7]: iteration 2960/ 3814 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 3.50 | learning rate: 7.169E-05 | global batch size: 512 | lm loss: 4.930026E+00 | loss scale: 131072.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.416 | TFLOPs: 20.56 | +[default7]: iteration 2970/ 3814 | consumed samples: 1520640 | consumed tokens: 3114270720 | elapsed time per iteration (s): 3.50 | learning rate: 7.011E-05 | global batch size: 512 | lm loss: 4.923178E+00 | loss scale: 131072.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.417 | TFLOPs: 20.56 | +[default7]: iteration 2980/ 3814 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 3.50 | learning rate: 6.854E-05 | global batch size: 512 | lm loss: 4.920733E+00 | loss scale: 131072.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.400 | TFLOPs: 20.55 | +[default7]: iteration 2990/ 3814 | consumed samples: 1530880 | consumed tokens: 3135242240 | elapsed time per iteration (s): 3.50 | learning rate: 6.698E-05 | global batch size: 512 | lm loss: 4.921407E+00 | loss scale: 131072.0 | grad norm: 0.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.423 | TFLOPs: 20.56 | +[default7]: iteration 3000/ 3814 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 3.50 | learning rate: 6.544E-05 | global batch size: 512 | lm loss: 4.919156E+00 | loss scale: 262144.0 | grad norm: 0.054 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.423 | TFLOPs: 20.56 | +[default7]:------------------------------------------------------------------------------------------ +[default7]:valid loss at iteration 3000 | lm loss value: 4.833266E+00 | lm loss PPL: 1.256206E+02 | +[default7]:------------------------------------------------------------------------------------------ +[default0]:saving checkpoint at iteration 3000 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default0]:[2023-02-03 10:46:47,019] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! +[default0]:[2023-02-03 10:46:47,023] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_01-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_01-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_04-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_04-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,054] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_05-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_05-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,057] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_06-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_06-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_07-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_07-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,062] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_08-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_08-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,065] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_09-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_09-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,067] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_10-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_10-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_11-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_11-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,072] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_12-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_12-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_13-model_00-model_states.pt... +[default6]:[2023-02-03 10:46:47,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default2]:[2023-02-03 10:46:47,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default3]:[2023-02-03 10:46:47,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default5]:[2023-02-03 10:46:47,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default0]:[2023-02-03 10:46:47,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_13-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_14-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_14-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,080] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_15-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_15-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_16-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_16-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,085] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_17-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_17-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_18-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_18-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,090] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_19-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_19-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,093] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_20-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_20-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_21-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_21-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,099] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_22-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_22-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_23-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_23-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,104] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_24-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_24-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_25-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_25-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_26-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_26-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,114] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_27-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_27-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,116] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_28-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_28-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,119] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_29-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_29-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,121] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_30-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_30-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_31-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_31-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,126] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_32-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_32-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_33-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_33-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_34-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_34-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,133] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_35-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_35-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,136] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_37-model_00-model_states.pt... +[default0]:[2023-02-03 10:46:47,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/layer_37-model_00-model_states.pt. +[default0]:[2023-02-03 10:46:47,138] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/mp_rank_00_model_states.pt +[default0]:[2023-02-03 10:46:47,138] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/mp_rank_00_model_states.pt... +[default0]:[2023-02-03 10:46:47,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/mp_rank_00_model_states.pt. +[default0]:[2023-02-03 10:46:47,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default7]:[2023-02-03 10:46:47,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default1]:[2023-02-03 10:46:47,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default1]:[2023-02-03 10:46:47,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2023-02-03 10:46:47,217] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default1]:[2023-02-03 10:46:47,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2023-02-03 10:46:47,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default6]:[2023-02-03 10:46:47,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2023-02-03 10:46:47,229] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default6]:[2023-02-03 10:46:47,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2023-02-03 10:46:47,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2023-02-03 10:46:47,222] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default2]:[2023-02-03 10:46:47,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2023-02-03 10:46:47,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2023-02-03 10:46:47,223] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default5]:[2023-02-03 10:46:47,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2023-02-03 10:46:47,219] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default5]:[2023-02-03 10:46:47,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2023-02-03 10:46:47,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2023-02-03 10:46:47,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2023-02-03 10:46:47,223] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2023-02-03 10:46:47,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]: successfully saved checkpoint at iteration 3000 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default7]:[2023-02-03 10:46:47,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2023-02-03 10:46:47,225] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default7]:[2023-02-03 10:46:47,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:time (ms) | save-checkpoint: 212.43 +[default4]:[2023-02-03 10:46:47,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2023-02-03 10:46:47,229] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3000/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default4]:[2023-02-03 10:46:47,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]: iteration 3010/ 3814 | consumed samples: 1541120 | consumed tokens: 3156213760 | elapsed time per iteration (s): 13.81 | learning rate: 6.392E-05 | global batch size: 512 | lm loss: 4.921815E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 37.086 | TFLOPs: 5.21 | +[default7]: iteration 3020/ 3814 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 3.50 | learning rate: 6.241E-05 | global batch size: 512 | lm loss: 4.915068E+00 | loss scale: 262144.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.358 | TFLOPs: 20.55 | +[default7]: iteration 3030/ 3814 | consumed samples: 1551360 | consumed tokens: 3177185280 | elapsed time per iteration (s): 3.50 | learning rate: 6.091E-05 | global batch size: 512 | lm loss: 4.921943E+00 | loss scale: 262144.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.346 | TFLOPs: 20.55 | +[default7]: iteration 3040/ 3814 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 3.50 | learning rate: 5.944E-05 | global batch size: 512 | lm loss: 4.910722E+00 | loss scale: 262144.0 | grad norm: 0.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.359 | TFLOPs: 20.55 | +[default7]: iteration 3050/ 3814 | consumed samples: 1561600 | consumed tokens: 3198156800 | elapsed time per iteration (s): 3.50 | learning rate: 5.797E-05 | global batch size: 512 | lm loss: 4.909994E+00 | loss scale: 262144.0 | grad norm: 0.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.347 | TFLOPs: 20.55 | +[default7]: iteration 3060/ 3814 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 3.50 | learning rate: 5.653E-05 | global batch size: 512 | lm loss: 4.915544E+00 | loss scale: 262144.0 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.339 | TFLOPs: 20.54 | +[default7]: iteration 3070/ 3814 | consumed samples: 1571840 | consumed tokens: 3219128320 | elapsed time per iteration (s): 3.50 | learning rate: 5.510E-05 | global batch size: 512 | lm loss: 4.907784E+00 | loss scale: 262144.0 | grad norm: 0.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.317 | TFLOPs: 20.54 | +[default7]: iteration 3080/ 3814 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 3.50 | learning rate: 5.368E-05 | global batch size: 512 | lm loss: 4.903420E+00 | loss scale: 262144.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.340 | TFLOPs: 20.54 | +[default7]: iteration 3090/ 3814 | consumed samples: 1582080 | consumed tokens: 3240099840 | elapsed time per iteration (s): 3.50 | learning rate: 5.228E-05 | global batch size: 512 | lm loss: 4.910515E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.304 | TFLOPs: 20.54 | +[default7]: iteration 3100/ 3814 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 3.50 | learning rate: 5.090E-05 | global batch size: 512 | lm loss: 4.904255E+00 | loss scale: 262144.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.354 | TFLOPs: 20.55 | +[default7]: iteration 3110/ 3814 | consumed samples: 1592320 | consumed tokens: 3261071360 | elapsed time per iteration (s): 3.50 | learning rate: 4.953E-05 | global batch size: 512 | lm loss: 4.904956E+00 | loss scale: 262144.0 | grad norm: 0.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.302 | TFLOPs: 20.54 | +[default7]: iteration 3120/ 3814 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 3.50 | learning rate: 4.819E-05 | global batch size: 512 | lm loss: 4.903614E+00 | loss scale: 262144.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.317 | TFLOPs: 20.54 | +[default7]: iteration 3130/ 3814 | consumed samples: 1602560 | consumed tokens: 3282042880 | elapsed time per iteration (s): 3.50 | learning rate: 4.685E-05 | global batch size: 512 | lm loss: 4.899263E+00 | loss scale: 262144.0 | grad norm: 0.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.300 | TFLOPs: 20.54 | +[default7]: iteration 3140/ 3814 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 3.50 | learning rate: 4.554E-05 | global batch size: 512 | lm loss: 4.907006E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.283 | TFLOPs: 20.54 | +[default7]: iteration 3150/ 3814 | consumed samples: 1612800 | consumed tokens: 3303014400 | elapsed time per iteration (s): 3.50 | learning rate: 4.424E-05 | global batch size: 512 | lm loss: 4.898956E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.305 | TFLOPs: 20.54 | +[default7]: iteration 3160/ 3814 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 3.50 | learning rate: 4.296E-05 | global batch size: 512 | lm loss: 4.903490E+00 | loss scale: 262144.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.242 | TFLOPs: 20.53 | +[default7]: iteration 3170/ 3814 | consumed samples: 1623040 | consumed tokens: 3323985920 | elapsed time per iteration (s): 3.51 | learning rate: 4.169E-05 | global batch size: 512 | lm loss: 4.906436E+00 | loss scale: 262144.0 | grad norm: 0.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.058 | TFLOPs: 20.51 | +[default7]: iteration 3180/ 3814 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 3.50 | learning rate: 4.044E-05 | global batch size: 512 | lm loss: 4.903299E+00 | loss scale: 262144.0 | grad norm: 0.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.166 | TFLOPs: 20.52 | +[default7]: iteration 3190/ 3814 | consumed samples: 1633280 | consumed tokens: 3344957440 | elapsed time per iteration (s): 3.50 | learning rate: 3.921E-05 | global batch size: 512 | lm loss: 4.908155E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.187 | TFLOPs: 20.52 | +[default7]: iteration 3200/ 3814 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 3.51 | learning rate: 3.800E-05 | global batch size: 512 | lm loss: 4.898455E+00 | loss scale: 262144.0 | grad norm: 0.064 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.066 | TFLOPs: 20.51 | +[default7]: iteration 3210/ 3814 | consumed samples: 1643520 | consumed tokens: 3365928960 | elapsed time per iteration (s): 3.50 | learning rate: 3.681E-05 | global batch size: 512 | lm loss: 4.899220E+00 | loss scale: 262144.0 | grad norm: 0.072 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.218 | TFLOPs: 20.53 | +[default7]: iteration 3220/ 3814 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 3.50 | learning rate: 3.563E-05 | global batch size: 512 | lm loss: 4.903805E+00 | loss scale: 262144.0 | grad norm: 0.070 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.171 | TFLOPs: 20.52 | +[default7]: iteration 3230/ 3814 | consumed samples: 1653760 | consumed tokens: 3386900480 | elapsed time per iteration (s): 3.50 | learning rate: 3.447E-05 | global batch size: 512 | lm loss: 4.894789E+00 | loss scale: 262144.0 | grad norm: 0.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.139 | TFLOPs: 20.52 | +[default7]: iteration 3240/ 3814 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 3.50 | learning rate: 3.333E-05 | global batch size: 512 | lm loss: 4.892554E+00 | loss scale: 262144.0 | grad norm: 0.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.116 | TFLOPs: 20.51 | +[default7]: iteration 3250/ 3814 | consumed samples: 1664000 | consumed tokens: 3407872000 | elapsed time per iteration (s): 3.50 | learning rate: 3.220E-05 | global batch size: 512 | lm loss: 4.886610E+00 | loss scale: 262144.0 | grad norm: 0.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.142 | TFLOPs: 20.52 | +[default7]: iteration 3260/ 3814 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 3.50 | learning rate: 3.109E-05 | global batch size: 512 | lm loss: 4.893465E+00 | loss scale: 262144.0 | grad norm: 0.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.101 | TFLOPs: 20.51 | +[default7]: iteration 3270/ 3814 | consumed samples: 1674240 | consumed tokens: 3428843520 | elapsed time per iteration (s): 3.50 | learning rate: 3.001E-05 | global batch size: 512 | lm loss: 4.890718E+00 | loss scale: 262144.0 | grad norm: 0.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.183 | TFLOPs: 20.52 | +[default7]: iteration 3280/ 3814 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 3.50 | learning rate: 2.894E-05 | global batch size: 512 | lm loss: 4.896983E+00 | loss scale: 262144.0 | grad norm: 0.076 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.133 | TFLOPs: 20.52 | +[default7]: iteration 3290/ 3814 | consumed samples: 1684480 | consumed tokens: 3449815040 | elapsed time per iteration (s): 3.50 | learning rate: 2.788E-05 | global batch size: 512 | lm loss: 4.882351E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.158 | TFLOPs: 20.52 | +[default7]: iteration 3300/ 3814 | consumed samples: 1689600 | consumed tokens: 3460300800 | elapsed time per iteration (s): 3.50 | learning rate: 2.685E-05 | global batch size: 512 | lm loss: 4.893690E+00 | loss scale: 262144.0 | grad norm: 0.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.191 | TFLOPs: 20.52 | +[default7]: iteration 3310/ 3814 | consumed samples: 1694720 | consumed tokens: 3470786560 | elapsed time per iteration (s): 3.50 | learning rate: 2.584E-05 | global batch size: 512 | lm loss: 4.897583E+00 | loss scale: 262144.0 | grad norm: 0.063 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.157 | TFLOPs: 20.52 | +[default7]: iteration 3320/ 3814 | consumed samples: 1699840 | consumed tokens: 3481272320 | elapsed time per iteration (s): 3.50 | learning rate: 2.484E-05 | global batch size: 512 | lm loss: 4.892740E+00 | loss scale: 262144.0 | grad norm: 0.063 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.192 | TFLOPs: 20.52 | +[default7]: iteration 3330/ 3814 | consumed samples: 1704960 | consumed tokens: 3491758080 | elapsed time per iteration (s): 3.50 | learning rate: 2.386E-05 | global batch size: 512 | lm loss: 4.892134E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.186 | TFLOPs: 20.52 | +[default7]: iteration 3340/ 3814 | consumed samples: 1710080 | consumed tokens: 3502243840 | elapsed time per iteration (s): 3.50 | learning rate: 2.290E-05 | global batch size: 512 | lm loss: 4.893458E+00 | loss scale: 262144.0 | grad norm: 0.065 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.162 | TFLOPs: 20.52 | +[default7]: iteration 3350/ 3814 | consumed samples: 1715200 | consumed tokens: 3512729600 | elapsed time per iteration (s): 3.50 | learning rate: 2.196E-05 | global batch size: 512 | lm loss: 4.901659E+00 | loss scale: 262144.0 | grad norm: 0.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.169 | TFLOPs: 20.52 | +[default7]: iteration 3360/ 3814 | consumed samples: 1720320 | consumed tokens: 3523215360 | elapsed time per iteration (s): 3.50 | learning rate: 2.104E-05 | global batch size: 512 | lm loss: 4.891296E+00 | loss scale: 262144.0 | grad norm: 0.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.078 | TFLOPs: 20.51 | +[default7]: iteration 3370/ 3814 | consumed samples: 1725440 | consumed tokens: 3533701120 | elapsed time per iteration (s): 3.50 | learning rate: 2.014E-05 | global batch size: 512 | lm loss: 4.890177E+00 | loss scale: 262144.0 | grad norm: 0.072 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.114 | TFLOPs: 20.51 | +[default7]: iteration 3380/ 3814 | consumed samples: 1730560 | consumed tokens: 3544186880 | elapsed time per iteration (s): 3.50 | learning rate: 1.925E-05 | global batch size: 512 | lm loss: 4.892258E+00 | loss scale: 262144.0 | grad norm: 0.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.128 | TFLOPs: 20.52 | +[default7]: iteration 3390/ 3814 | consumed samples: 1735680 | consumed tokens: 3554672640 | elapsed time per iteration (s): 3.50 | learning rate: 1.839E-05 | global batch size: 512 | lm loss: 4.880137E+00 | loss scale: 262144.0 | grad norm: 0.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.084 | TFLOPs: 20.51 | +[default7]: iteration 3400/ 3814 | consumed samples: 1740800 | consumed tokens: 3565158400 | elapsed time per iteration (s): 3.50 | learning rate: 1.754E-05 | global batch size: 512 | lm loss: 4.896400E+00 | loss scale: 262144.0 | grad norm: 0.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.123 | TFLOPs: 20.51 | +[default7]: iteration 3410/ 3814 | consumed samples: 1745920 | consumed tokens: 3575644160 | elapsed time per iteration (s): 3.50 | learning rate: 1.672E-05 | global batch size: 512 | lm loss: 4.899175E+00 | loss scale: 262144.0 | grad norm: 0.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.170 | TFLOPs: 20.52 | +[default7]: iteration 3420/ 3814 | consumed samples: 1751040 | consumed tokens: 3586129920 | elapsed time per iteration (s): 3.50 | learning rate: 1.591E-05 | global batch size: 512 | lm loss: 4.890435E+00 | loss scale: 262144.0 | grad norm: 0.059 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.191 | TFLOPs: 20.52 | +[default7]: iteration 3430/ 3814 | consumed samples: 1756160 | consumed tokens: 3596615680 | elapsed time per iteration (s): 3.50 | learning rate: 1.512E-05 | global batch size: 512 | lm loss: 4.890594E+00 | loss scale: 262144.0 | grad norm: 0.056 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.131 | TFLOPs: 20.52 | +[default7]: iteration 3440/ 3814 | consumed samples: 1761280 | consumed tokens: 3607101440 | elapsed time per iteration (s): 3.50 | learning rate: 1.435E-05 | global batch size: 512 | lm loss: 4.888838E+00 | loss scale: 262144.0 | grad norm: 0.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.101 | TFLOPs: 20.51 | +[default7]: iteration 3450/ 3814 | consumed samples: 1766400 | consumed tokens: 3617587200 | elapsed time per iteration (s): 3.51 | learning rate: 1.360E-05 | global batch size: 512 | lm loss: 4.890282E+00 | loss scale: 262144.0 | grad norm: 0.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.037 | TFLOPs: 20.50 | +[default7]: iteration 3460/ 3814 | consumed samples: 1771520 | consumed tokens: 3628072960 | elapsed time per iteration (s): 3.50 | learning rate: 1.287E-05 | global batch size: 512 | lm loss: 4.892103E+00 | loss scale: 262144.0 | grad norm: 0.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.185 | TFLOPs: 20.52 | +[default7]: iteration 3470/ 3814 | consumed samples: 1776640 | consumed tokens: 3638558720 | elapsed time per iteration (s): 3.50 | learning rate: 1.217E-05 | global batch size: 512 | lm loss: 4.902133E+00 | loss scale: 262144.0 | grad norm: 0.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.269 | TFLOPs: 20.53 | +[default7]: iteration 3480/ 3814 | consumed samples: 1781760 | consumed tokens: 3649044480 | elapsed time per iteration (s): 3.50 | learning rate: 1.147E-05 | global batch size: 512 | lm loss: 4.889071E+00 | loss scale: 262144.0 | grad norm: 0.056 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.272 | TFLOPs: 20.54 | +[default7]: iteration 3490/ 3814 | consumed samples: 1786880 | consumed tokens: 3659530240 | elapsed time per iteration (s): 3.50 | learning rate: 1.080E-05 | global batch size: 512 | lm loss: 4.895580E+00 | loss scale: 262144.0 | grad norm: 0.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.325 | TFLOPs: 20.54 | +[default7]: iteration 3500/ 3814 | consumed samples: 1792000 | consumed tokens: 3670016000 | elapsed time per iteration (s): 3.50 | learning rate: 1.015E-05 | global batch size: 512 | lm loss: 4.889579E+00 | loss scale: 524288.0 | grad norm: 0.029 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.330 | TFLOPs: 20.54 | +[default7]: iteration 3510/ 3814 | consumed samples: 1797120 | consumed tokens: 3680501760 | elapsed time per iteration (s): 3.50 | learning rate: 9.523E-06 | global batch size: 512 | lm loss: 4.888856E+00 | loss scale: 524288.0 | grad norm: 0.057 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.307 | TFLOPs: 20.54 | +[default7]: iteration 3520/ 3814 | consumed samples: 1802240 | consumed tokens: 3690987520 | elapsed time per iteration (s): 3.50 | learning rate: 8.911E-06 | global batch size: 512 | lm loss: 4.889877E+00 | loss scale: 524288.0 | grad norm: 0.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.331 | TFLOPs: 20.54 | +[default7]: iteration 3530/ 3814 | consumed samples: 1807360 | consumed tokens: 3701473280 | elapsed time per iteration (s): 3.50 | learning rate: 8.320E-06 | global batch size: 512 | lm loss: 4.879252E+00 | loss scale: 524288.0 | grad norm: 0.064 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.318 | TFLOPs: 20.54 | +[default7]: iteration 3540/ 3814 | consumed samples: 1812480 | consumed tokens: 3711959040 | elapsed time per iteration (s): 3.50 | learning rate: 7.749E-06 | global batch size: 512 | lm loss: 4.877120E+00 | loss scale: 524288.0 | grad norm: 0.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.314 | TFLOPs: 20.54 | +[default7]: iteration 3550/ 3814 | consumed samples: 1817600 | consumed tokens: 3722444800 | elapsed time per iteration (s): 3.50 | learning rate: 7.198E-06 | global batch size: 512 | lm loss: 4.889153E+00 | loss scale: 524288.0 | grad norm: 0.056 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.320 | TFLOPs: 20.54 | +[default7]: iteration 3560/ 3814 | consumed samples: 1822720 | consumed tokens: 3732930560 | elapsed time per iteration (s): 3.50 | learning rate: 6.667E-06 | global batch size: 512 | lm loss: 4.889434E+00 | loss scale: 524288.0 | grad norm: 0.055 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.299 | TFLOPs: 20.54 | +[default7]: iteration 3570/ 3814 | consumed samples: 1827840 | consumed tokens: 3743416320 | elapsed time per iteration (s): 3.50 | learning rate: 6.155E-06 | global batch size: 512 | lm loss: 4.885847E+00 | loss scale: 524288.0 | grad norm: 0.055 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.268 | TFLOPs: 20.53 | +[default7]: iteration 3580/ 3814 | consumed samples: 1832960 | consumed tokens: 3753902080 | elapsed time per iteration (s): 3.50 | learning rate: 5.665E-06 | global batch size: 512 | lm loss: 4.893823E+00 | loss scale: 524288.0 | grad norm: 0.055 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.306 | TFLOPs: 20.54 | +[default7]: iteration 3590/ 3814 | consumed samples: 1838080 | consumed tokens: 3764387840 | elapsed time per iteration (s): 3.50 | learning rate: 5.194E-06 | global batch size: 512 | lm loss: 4.887161E+00 | loss scale: 524288.0 | grad norm: 0.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.317 | TFLOPs: 20.54 | +[default7]: iteration 3600/ 3814 | consumed samples: 1843200 | consumed tokens: 3774873600 | elapsed time per iteration (s): 3.50 | learning rate: 4.743E-06 | global batch size: 512 | lm loss: 4.880912E+00 | loss scale: 524288.0 | grad norm: 0.055 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.284 | TFLOPs: 20.54 | +[default7]: iteration 3610/ 3814 | consumed samples: 1848320 | consumed tokens: 3785359360 | elapsed time per iteration (s): 3.50 | learning rate: 4.313E-06 | global batch size: 512 | lm loss: 4.882924E+00 | loss scale: 524288.0 | grad norm: 0.056 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.293 | TFLOPs: 20.54 | +[default7]: iteration 3620/ 3814 | consumed samples: 1853440 | consumed tokens: 3795845120 | elapsed time per iteration (s): 3.50 | learning rate: 3.903E-06 | global batch size: 512 | lm loss: 4.888158E+00 | loss scale: 524288.0 | grad norm: 0.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.314 | TFLOPs: 20.54 | +[default7]: iteration 3630/ 3814 | consumed samples: 1858560 | consumed tokens: 3806330880 | elapsed time per iteration (s): 3.50 | learning rate: 3.513E-06 | global batch size: 512 | lm loss: 4.887733E+00 | loss scale: 524288.0 | grad norm: 0.055 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.308 | TFLOPs: 20.54 | +[default7]: iteration 3640/ 3814 | consumed samples: 1863680 | consumed tokens: 3816816640 | elapsed time per iteration (s): 3.50 | learning rate: 3.144E-06 | global batch size: 512 | lm loss: 4.888723E+00 | loss scale: 524288.0 | grad norm: 0.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.310 | TFLOPs: 20.54 | +[default7]: iteration 3650/ 3814 | consumed samples: 1868800 | consumed tokens: 3827302400 | elapsed time per iteration (s): 3.50 | learning rate: 2.795E-06 | global batch size: 512 | lm loss: 4.887241E+00 | loss scale: 524288.0 | grad norm: 0.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.309 | TFLOPs: 20.54 | +[default7]: iteration 3660/ 3814 | consumed samples: 1873920 | consumed tokens: 3837788160 | elapsed time per iteration (s): 3.50 | learning rate: 2.466E-06 | global batch size: 512 | lm loss: 4.882038E+00 | loss scale: 524288.0 | grad norm: 0.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.271 | TFLOPs: 20.54 | +[default7]: iteration 3670/ 3814 | consumed samples: 1879040 | consumed tokens: 3848273920 | elapsed time per iteration (s): 3.50 | learning rate: 2.158E-06 | global batch size: 512 | lm loss: 4.877400E+00 | loss scale: 524288.0 | grad norm: 0.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.250 | TFLOPs: 20.53 | +[default7]: iteration 3680/ 3814 | consumed samples: 1884160 | consumed tokens: 3858759680 | elapsed time per iteration (s): 3.50 | learning rate: 1.871E-06 | global batch size: 512 | lm loss: 4.880681E+00 | loss scale: 524288.0 | grad norm: 0.055 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.251 | TFLOPs: 20.53 | +[default7]: iteration 3690/ 3814 | consumed samples: 1889280 | consumed tokens: 3869245440 | elapsed time per iteration (s): 3.50 | learning rate: 1.603E-06 | global batch size: 512 | lm loss: 4.880221E+00 | loss scale: 524288.0 | grad norm: 0.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.271 | TFLOPs: 20.54 | +[default7]: iteration 3700/ 3814 | consumed samples: 1894400 | consumed tokens: 3879731200 | elapsed time per iteration (s): 3.50 | learning rate: 1.357E-06 | global batch size: 512 | lm loss: 4.891531E+00 | loss scale: 524288.0 | grad norm: 0.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.290 | TFLOPs: 20.54 | +[default7]: iteration 3710/ 3814 | consumed samples: 1899520 | consumed tokens: 3890216960 | elapsed time per iteration (s): 3.50 | learning rate: 1.131E-06 | global batch size: 512 | lm loss: 4.887794E+00 | loss scale: 524288.0 | grad norm: 0.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.285 | TFLOPs: 20.54 | +[default7]: iteration 3720/ 3814 | consumed samples: 1904640 | consumed tokens: 3900702720 | elapsed time per iteration (s): 3.50 | learning rate: 9.251E-07 | global batch size: 512 | lm loss: 4.885019E+00 | loss scale: 524288.0 | grad norm: 0.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.273 | TFLOPs: 20.54 | +[default7]: iteration 3730/ 3814 | consumed samples: 1909760 | consumed tokens: 3911188480 | elapsed time per iteration (s): 3.50 | learning rate: 7.401E-07 | global batch size: 512 | lm loss: 4.886095E+00 | loss scale: 524288.0 | grad norm: 0.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.299 | TFLOPs: 20.54 | +[default7]: iteration 3740/ 3814 | consumed samples: 1914880 | consumed tokens: 3921674240 | elapsed time per iteration (s): 3.50 | learning rate: 5.758E-07 | global batch size: 512 | lm loss: 4.890682E+00 | loss scale: 524288.0 | grad norm: 0.048 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.301 | TFLOPs: 20.54 | +[default7]: iteration 3750/ 3814 | consumed samples: 1920000 | consumed tokens: 3932160000 | elapsed time per iteration (s): 3.50 | learning rate: 4.320E-07 | global batch size: 512 | lm loss: 4.891643E+00 | loss scale: 524288.0 | grad norm: 0.046 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.302 | TFLOPs: 20.54 | +[default7]: iteration 3760/ 3814 | consumed samples: 1925120 | consumed tokens: 3942645760 | elapsed time per iteration (s): 3.50 | learning rate: 3.088E-07 | global batch size: 512 | lm loss: 4.886123E+00 | loss scale: 524288.0 | grad norm: 0.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.286 | TFLOPs: 20.54 | +[default7]: iteration 3770/ 3814 | consumed samples: 1930240 | consumed tokens: 3953131520 | elapsed time per iteration (s): 3.50 | learning rate: 2.062E-07 | global batch size: 512 | lm loss: 4.885649E+00 | loss scale: 524288.0 | grad norm: 0.053 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.292 | TFLOPs: 20.54 | +[default7]: iteration 3780/ 3814 | consumed samples: 1935360 | consumed tokens: 3963617280 | elapsed time per iteration (s): 3.50 | learning rate: 1.243E-07 | global batch size: 512 | lm loss: 4.882343E+00 | loss scale: 524288.0 | grad norm: 0.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.291 | TFLOPs: 20.54 | +[default7]: iteration 3790/ 3814 | consumed samples: 1940480 | consumed tokens: 3974103040 | elapsed time per iteration (s): 3.50 | learning rate: 6.296E-08 | global batch size: 512 | lm loss: 4.881754E+00 | loss scale: 524288.0 | grad norm: 0.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.293 | TFLOPs: 20.54 | +[default7]: iteration 3800/ 3814 | consumed samples: 1945600 | consumed tokens: 3984588800 | elapsed time per iteration (s): 3.50 | learning rate: 2.230E-08 | global batch size: 512 | lm loss: 4.890694E+00 | loss scale: 524288.0 | grad norm: 0.053 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.168 | TFLOPs: 20.52 | +[default7]: iteration 3810/ 3814 | consumed samples: 1950720 | consumed tokens: 3995074560 | elapsed time per iteration (s): 3.51 | learning rate: 2.278E-09 | global batch size: 512 | lm loss: 4.891231E+00 | loss scale: 524288.0 | grad norm: 0.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.052 | TFLOPs: 20.50 | +[default0]:[after training is done] datetime: 2023-02-03 11:34:17 +[default0]:saving checkpoint at iteration 3814 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default0]:[2023-02-03 11:35:58,566] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3814 is begin to save! +[default0]:[2023-02-03 11:35:58,570] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_01-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_01-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_04-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_04-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,622] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_05-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_05-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_06-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_06-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,627] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_07-model_00-model_states.pt... +[default7]:------------------------------------------------------------------------------------------------------------ +[default7]:valid loss at the end of training for val data | lm loss value: 4.800495E+00 | lm loss PPL: 1.215706E+02 | +[default7]:------------------------------------------------------------------------------------------------------------ +[default0]:[2023-02-03 11:35:58,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_07-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_08-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_08-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,632] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_09-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_09-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_10-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_10-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,638] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_11-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_11-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_12-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_12-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_13-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_13-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_14-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_14-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,647] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_15-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_15-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,650] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_16-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_16-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,654] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_17-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_17-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,658] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_18-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_18-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_19-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_19-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,663] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_20-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_20-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,665] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_21-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_21-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,667] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_22-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_22-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,670] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_23-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_23-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_24-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_24-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,675] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_25-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_25-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_26-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_26-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,679] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_27-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_27-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_28-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_28-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,684] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_29-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_29-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_30-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_30-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,689] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_31-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_31-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,691] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_32-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_32-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,694] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_33-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_33-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_34-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_34-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,698] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_35-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_35-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_37-model_00-model_states.pt... +[default0]:[2023-02-03 11:35:58,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/layer_37-model_00-model_states.pt. +[default0]:[2023-02-03 11:35:58,702] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/mp_rank_00_model_states.pt +[default0]:[2023-02-03 11:35:58,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/mp_rank_00_model_states.pt... +[default0]:[2023-02-03 11:35:58,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/mp_rank_00_model_states.pt. +[default0]:[2023-02-03 11:35:58,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default5]:[2023-02-03 11:35:58,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default3]:[2023-02-03 11:35:58,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default4]:[2023-02-03 11:35:58,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default6]:[2023-02-03 11:35:58,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default7]:[2023-02-03 11:35:58,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default2]:[2023-02-03 11:35:58,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default2]:[2023-02-03 11:35:58,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2023-02-03 11:35:58,770] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default2]:[2023-02-03 11:35:58,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3814 is ready now! +[default1]:[2023-02-03 11:35:58,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default1]:[2023-02-03 11:35:58,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2023-02-03 11:35:58,766] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default1]:[2023-02-03 11:35:58,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3814 is ready now! +[default0]:[2023-02-03 11:35:58,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2023-02-03 11:35:58,763] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2023-02-03 11:35:58,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3814 is ready now! +[default0]: successfully saved checkpoint at iteration 3814 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default5]:[2023-02-03 11:35:58,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2023-02-03 11:35:58,786] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default5]:[2023-02-03 11:35:58,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3814 is ready now! +[default3]:[2023-02-03 11:35:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2023-02-03 11:35:58,776] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default3]:[2023-02-03 11:35:58,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3814 is ready now! +[default4]:[2023-02-03 11:35:58,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2023-02-03 11:35:58,780] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default4]:[2023-02-03 11:35:58,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3814 is ready now! +[default6]:[2023-02-03 11:35:58,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2023-02-03 11:35:58,772] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default6]:[2023-02-03 11:35:58,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3814 is ready now! +[default7]:[2023-02-03 11:35:58,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2023-02-03 11:35:58,771] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-39M-lr0.001-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3814/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default7]:[2023-02-03 11:35:58,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3814 is ready now! +[default0]:Evaluating iter 10/100 +[default0]:Evaluating iter 20/100 +[default0]:Evaluating iter 30/100 +[default0]:Evaluating iter 40/100 +[default0]:Evaluating iter 50/100 +[default0]:Evaluating iter 60/100 +[default0]:Evaluating iter 70/100 +[default0]:Evaluating iter 80/100 +[default0]:Evaluating iter 90/100 +[default0]:Evaluating iter 100/100 +[default7]:------------------------------------------------------------------------------------------------------------ +[default7]:test loss at the end of training for test data | lm loss value: 4.800749E+00 | lm loss PPL: 1.216014E+02 | +[default7]:------------------------------------------------------------------------------------------------------------