diff --git "a/training.log" "b/training.log" new file mode 100644--- /dev/null +++ "b/training.log" @@ -0,0 +1,2430 @@ +[2024-01-21 17:02:45,007] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-21 17:02:47,259] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +[2024-01-21 17:02:47,260] [INFO] [runner.py:571:main] cmd = /hpc2hdd/home/yli258/.conda/envs/graphR/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29501 --enable_each_rank_log=None main.py --data_path local/jsonfile --data_split 10,0,0 --model_name_or_path /hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-7b-hf --per_device_train_batch_size 4 --per_device_eval_batch_size 2 --max_seq_len 2048 --learning_rate 2e-5 --weight_decay 0. --num_train_epochs 3 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 100 --seed 1234 --zero_stage 3 --deepspeed --gradient_checkpointing --output_dir /hpc2hdd/home/yli258/jhaidata/Graph-Reasoning-LLM/ckpts/llama_7b_rft_v1_k6/ +[2024-01-21 17:02:48,893] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-21 17:02:50,774] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.15.5-1+cuda11.8 +[2024-01-21 17:02:50,774] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE_VERSION=2.15.5-1 +[2024-01-21 17:02:50,774] [INFO] [launch.py:138:main] 0 NCCL_VERSION=2.15.5-1 +[2024-01-21 17:02:50,774] [INFO] [launch.py:138:main] 0 NCCL_SOCKET_IFNAME=eth2 +[2024-01-21 17:02:50,774] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE_NAME=libnccl-dev +[2024-01-21 17:02:50,774] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE=libnccl2=2.15.5-1+cuda11.8 +[2024-01-21 17:02:50,774] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE_NAME=libnccl2 +[2024-01-21 17:02:50,774] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE_VERSION=2.15.5-1 +[2024-01-21 17:02:50,774] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]} +[2024-01-21 17:02:50,774] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=8, node_rank=0 +[2024-01-21 17:02:50,774] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}) +[2024-01-21 17:02:50,774] [INFO] [launch.py:163:main] dist_world_size=8 +[2024-01-21 17:02:50,774] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +[2024-01-21 17:02:52,821] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-21 17:02:52,845] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-21 17:02:52,845] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-21 17:02:52,855] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-21 17:02:52,855] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-21 17:02:52,886] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-21 17:02:52,955] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-01-21 17:02:52,955] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +[2024-01-21 17:02:55,412] [INFO] [comm.py:637:init_distributed] cdb=None +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +[2024-01-21 17:02:55,485] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-01-21 17:02:55,493] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-01-21 17:02:55,493] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +[2024-01-21 17:02:55,506] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-01-21 17:02:55,514] [INFO] [comm.py:637:init_distributed] cdb=None +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +[2024-01-21 17:02:55,559] [INFO] [comm.py:637:init_distributed] cdb=None +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +[2024-01-21 17:02:56,343] [INFO] [comm.py:637:init_distributed] cdb=None +/hpc2hdd/home/yli258/.conda/envs/graphR/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations + warnings.warn( +[2024-01-21 17:02:56,403] [INFO] [comm.py:637:init_distributed] cdb=None +loading from ...loading from ...loading from ...loading from ... loading from ... loading from ...loading from .../hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-7b-hfloading from ... /hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-7b-hf/hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-7b-hf + /hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-7b-hf + +/hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-7b-hf/hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-7b-hf/hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-7b-hf +/hpc2hdd/home/yli258/olddata/yuhanli/LLaMA_v2_ckpts/hf/Llama-2-7b-hf + + + +[2024-01-21 17:02:58,469] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 291, num_elems = 6.74B + Loading checkpoint shards: 0%| | 0/2 [00:00 +[2024-01-21 17:11:50,601] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False +[2024-01-21 17:11:50,601] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer +[2024-01-21 17:11:51,182] [INFO] [utils.py:791:see_memory_usage] Stage 3 initialize beginning +[2024-01-21 17:11:51,183] [INFO] [utils.py:792:see_memory_usage] MA 2.06 GB Max_MA 2.61 GB CA 3.28 GB Max_CA 3 GB +[2024-01-21 17:11:51,183] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.51 GB, percent = 5.6% +[2024-01-21 17:11:51,185] [INFO] [stage3.py:127:__init__] Reduce bucket size 500,000,000 +[2024-01-21 17:11:51,185] [INFO] [stage3.py:128:__init__] Prefetch bucket size 30000000 +[2024-01-21 17:11:51,568] [INFO] [utils.py:791:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] +[2024-01-21 17:11:51,569] [INFO] [utils.py:792:see_memory_usage] MA 2.06 GB Max_MA 2.06 GB CA 3.28 GB Max_CA 3 GB +[2024-01-21 17:11:51,569] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.52 GB, percent = 5.6% +Parameter Offload: Total persistent parameters: 266240 in 65 params +[2024-01-21 17:11:51,957] [INFO] [utils.py:791:see_memory_usage] DeepSpeedZeRoOffload initialize [end] +[2024-01-21 17:11:51,958] [INFO] [utils.py:792:see_memory_usage] MA 1.63 GB Max_MA 2.09 GB CA 3.28 GB Max_CA 3 GB +[2024-01-21 17:11:51,958] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.53 GB, percent = 5.6% +[2024-01-21 17:11:52,319] [INFO] [utils.py:791:see_memory_usage] Before creating fp16 partitions +[2024-01-21 17:11:52,320] [INFO] [utils.py:792:see_memory_usage] MA 1.63 GB Max_MA 1.63 GB CA 3.28 GB Max_CA 3 GB +[2024-01-21 17:11:52,320] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.53 GB, percent = 5.6% +[2024-01-21 17:11:53,849] [INFO] [utils.py:791:see_memory_usage] After creating fp16 partitions: 2 +[2024-01-21 17:11:53,850] [INFO] [utils.py:792:see_memory_usage] MA 1.63 GB Max_MA 1.63 GB CA 1.63 GB Max_CA 3 GB +[2024-01-21 17:11:53,851] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 115.11 GB, percent = 5.7% +[2024-01-21 17:11:54,223] [INFO] [utils.py:791:see_memory_usage] Before creating fp32 partitions +[2024-01-21 17:11:54,224] [INFO] [utils.py:792:see_memory_usage] MA 1.63 GB Max_MA 1.63 GB CA 1.63 GB Max_CA 2 GB +[2024-01-21 17:11:54,224] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.54 GB, percent = 5.6% +[2024-01-21 17:11:54,599] [INFO] [utils.py:791:see_memory_usage] After creating fp32 partitions +[2024-01-21 17:11:54,600] [INFO] [utils.py:792:see_memory_usage] MA 4.77 GB Max_MA 6.34 GB CA 6.34 GB Max_CA 6 GB +[2024-01-21 17:11:54,600] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.54 GB, percent = 5.6% +[2024-01-21 17:11:54,974] [INFO] [utils.py:791:see_memory_usage] Before initializing optimizer states +[2024-01-21 17:11:54,975] [INFO] [utils.py:792:see_memory_usage] MA 4.77 GB Max_MA 4.77 GB CA 6.34 GB Max_CA 6 GB +[2024-01-21 17:11:54,975] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.53 GB, percent = 5.6% +[2024-01-21 17:11:55,353] [INFO] [utils.py:791:see_memory_usage] After initializing optimizer states +[2024-01-21 17:11:55,353] [INFO] [utils.py:792:see_memory_usage] MA 11.05 GB Max_MA 14.19 GB CA 15.76 GB Max_CA 16 GB +[2024-01-21 17:11:55,353] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.52 GB, percent = 5.6% +[2024-01-21 17:11:55,354] [INFO] [stage3.py:479:_setup_for_real_optimizer] optimizer state initialized +[2024-01-21 17:11:55,856] [INFO] [utils.py:791:see_memory_usage] After initializing ZeRO optimizer +[2024-01-21 17:11:55,856] [INFO] [utils.py:792:see_memory_usage] MA 13.55 GB Max_MA 14.04 GB CA 18.01 GB Max_CA 18 GB +[2024-01-21 17:11:55,857] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 113.86 GB, percent = 5.7% +[2024-01-21 17:11:55,857] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[2024-01-21 17:11:55,857] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2024-01-21 17:11:55,857] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2024-01-21 17:11:55,857] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:11:55,858] [INFO] [config.py:984:print] DeepSpeedEngine configuration: +[2024-01-21 17:11:55,858] [INFO] [config.py:988:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2024-01-21 17:11:55,858] [INFO] [config.py:988:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2024-01-21 17:11:55,858] [INFO] [config.py:988:print] amp_enabled .................. False +[2024-01-21 17:11:55,858] [INFO] [config.py:988:print] amp_params ................... False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] bfloat16_enabled ............. True +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] checkpoint_parallel_write_pipeline False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] checkpoint_tag_validation_enabled True +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] checkpoint_tag_validation_fail False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] comms_config ................. +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] communication_data_type ...... None +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] curriculum_enabled_legacy .... False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] curriculum_params_legacy ..... False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] data_efficiency_enabled ...... False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] dataloader_drop_last ......... False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] disable_allgather ............ False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] dump_state ................... False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] dynamic_loss_scale_args ...... None +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] eigenvalue_enabled ........... False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] eigenvalue_gas_boundary_resolution 1 +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] eigenvalue_layer_name ........ bert.encoder.layer +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] eigenvalue_layer_num ......... 0 +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] eigenvalue_max_iter .......... 100 +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] eigenvalue_stability ......... 1e-06 +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] eigenvalue_tol ............... 0.01 +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] eigenvalue_verbose ........... False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] elasticity_enabled ........... False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] fp16_auto_cast ............... None +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] fp16_enabled ................. False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] fp16_master_weights_and_gradients False +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] global_rank .................. 0 +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] grad_accum_dtype ............. None +[2024-01-21 17:11:55,859] [INFO] [config.py:988:print] gradient_accumulation_steps .. 1 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] gradient_clipping ............ 1.0 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] gradient_predivide_factor .... 1.0 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] graph_harvesting ............. False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] initial_dynamic_scale ........ 1 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] load_universal_checkpoint .... False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] loss_scale ................... 1.0 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] memory_breakdown ............. False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] mics_hierarchial_params_gather False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] mics_shard_size .............. -1 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] optimizer_legacy_fusion ...... False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] optimizer_name ............... None +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] optimizer_params ............. None +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] pld_enabled .................. False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] pld_params ................... False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] prescale_gradients ........... False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] scheduler_name ............... None +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] scheduler_params ............. None +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] seq_parallel_communication_data_type torch.float32 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] sparse_attention ............. None +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] sparse_gradients_enabled ..... False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] steps_per_print .............. 10 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] train_batch_size ............. 32 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] train_micro_batch_size_per_gpu 4 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] use_data_before_expert_parallel_ False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] use_node_local_storage ....... False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] wall_clock_breakdown ......... False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] weight_quantization_config ... None +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] world_size ................... 8 +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] zero_allow_untested_optimizer False +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=sys.maxsize max_live_parameters=30000000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] zero_enabled ................. True +[2024-01-21 17:11:55,860] [INFO] [config.py:988:print] zero_force_ds_cpu_optimizer .. True +[2024-01-21 17:11:55,861] [INFO] [config.py:988:print] zero_optimization_stage ...... 3 +[2024-01-21 17:11:55,861] [INFO] [config.py:974:print_user_config] json = { + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 10, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "none" + }, + "offload_optimizer": { + "device": "none" + }, + "stage3_param_persistence_threshold": 1.000000e+04, + "stage3_max_live_parameters": 3.000000e+07, + "stage3_prefetch_bucket_size": 3.000000e+07, + "memory_efficient_linear": false + }, + "bf16": { + "enabled": true, + "loss_scale_window": 50, + "min_loss_scale": 1e-10 + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown": false, + "hybrid_engine": { + "enabled": false, + "max_out_tokens": 512, + "inference_tp_size": 1, + "release_inference_cache": false, + "pin_parameters": true, + "tp_gather_partition_size": 8 + } +} +***** Running training ***** +***** Evaluating perplexity, Epoch 0/3 ***** +[2024-01-21 17:11:56,034] [WARNING] [parameter_offload.py:86:_apply_to_tensors_only] A module has unknown inputs or outputs type () and the tensors embedded in it cannot be detected. The ZeRO-3 hooks designed to trigger before or after backward pass of the module relies on knowing the input and output tensors and therefore may not get triggered properly. +ppl: (30.828693389892578, 24475603042304.0) +Beginning of Epoch 1/3, Total Micro Batches 2329 +Epoch: 0, Total Step: 1, Loss: 24.211406707763672 +[2024-01-21 17:12:36,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[2.0000000000000003e-06, 2.0000000000000003e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:12:36,117] [INFO] [timer.py:260:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=8.301860946949287, CurrSamplesPerSec=8.291449315140763, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 11, Loss: 19.955772399902344 +[2024-01-21 17:13:14,792] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[4.000000000000001e-06, 4.000000000000001e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:13:14,793] [INFO] [timer.py:260:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=8.287891755191987, CurrSamplesPerSec=8.253705313503444, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 21, Loss: 0.3317473828792572 +[2024-01-21 17:13:53,610] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[6e-06, 6e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:13:53,611] [INFO] [timer.py:260:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=8.273005417332326, CurrSamplesPerSec=8.249608794617835, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 31, Loss: 0.2295999825000763 +[2024-01-21 17:14:32,495] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[8.000000000000001e-06, 8.000000000000001e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:14:32,496] [INFO] [timer.py:260:stop] epoch=0/micro_step=40/global_step=40, RunningAvgSamplesPerSec=8.262132164118825, CurrSamplesPerSec=8.21872339180001, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 41, Loss: 0.1584744155406952 +[2024-01-21 17:15:11,472] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[1e-05, 1e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:15:11,472] [INFO] [timer.py:260:stop] epoch=0/micro_step=50/global_step=50, RunningAvgSamplesPerSec=8.251780669055808, CurrSamplesPerSec=8.21251424303653, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 51, Loss: 0.17504236102104187 +[2024-01-21 17:15:50,514] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[1.2e-05, 1.2e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:15:50,514] [INFO] [timer.py:260:stop] epoch=0/micro_step=60/global_step=60, RunningAvgSamplesPerSec=8.242620723826704, CurrSamplesPerSec=8.202254821029905, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 61, Loss: 0.1482594758272171 +[2024-01-21 17:16:29,515] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[1.4e-05, 1.4e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:16:29,516] [INFO] [timer.py:260:stop] epoch=0/micro_step=70/global_step=70, RunningAvgSamplesPerSec=8.237426860670835, CurrSamplesPerSec=8.222783241755145, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 71, Loss: 0.2160094827413559 +[2024-01-21 17:17:08,556] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[1.6000000000000003e-05, 1.6000000000000003e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:17:08,557] [INFO] [timer.py:260:stop] epoch=0/micro_step=80/global_step=80, RunningAvgSamplesPerSec=8.2325026708751, CurrSamplesPerSec=8.200739313257413, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 81, Loss: 0.03728732466697693 +[2024-01-21 17:17:47,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[1.8e-05, 1.8e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:17:47,500] [INFO] [timer.py:260:stop] epoch=0/micro_step=90/global_step=90, RunningAvgSamplesPerSec=8.231045268559809, CurrSamplesPerSec=8.199998802542861, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 91, Loss: 0.06438737362623215 +[2024-01-21 17:18:26,462] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:18:26,462] [INFO] [timer.py:260:stop] epoch=0/micro_step=100/global_step=100, RunningAvgSamplesPerSec=8.229479437129255, CurrSamplesPerSec=8.185885067911, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 101, Loss: 0.044852279126644135 +[2024-01-21 17:19:05,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[1.9999895957966793e-05, 1.9999895957966793e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:19:05,421] [INFO] [timer.py:260:stop] epoch=0/micro_step=110/global_step=110, RunningAvgSamplesPerSec=8.228267704244189, CurrSamplesPerSec=8.21717915419049, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 111, Loss: 0.08844884485006332 +[2024-01-21 17:19:44,413] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[1.9999583834032114e-05, 1.9999583834032114e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:19:44,413] [INFO] [timer.py:260:stop] epoch=0/micro_step=120/global_step=120, RunningAvgSamplesPerSec=8.226674663169387, CurrSamplesPerSec=8.220355308854977, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 121, Loss: 0.10905247181653976 +[2024-01-21 17:20:23,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[1.999906363469077e-05, 1.999906363469077e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:20:23,381] [INFO] [timer.py:260:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=8.225726627632133, CurrSamplesPerSec=8.23894246085945, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 131, Loss: 0.02977502904832363 +[2024-01-21 17:21:02,401] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[1.999833537076728e-05, 1.999833537076728e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:21:02,402] [INFO] [timer.py:260:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=8.224104288705288, CurrSamplesPerSec=8.221204746009965, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 141, Loss: 0.08010333776473999 +[2024-01-21 17:21:41,475] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[1.999739905741565e-05, 1.999739905741565e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:21:41,476] [INFO] [timer.py:260:stop] epoch=0/micro_step=150/global_step=150, RunningAvgSamplesPerSec=8.221938964217955, CurrSamplesPerSec=8.200404613908493, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 151, Loss: 0.16412055492401123 +[2024-01-21 17:22:20,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[1.9996254714119076e-05, 1.9996254714119076e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:22:20,444] [INFO] [timer.py:260:stop] epoch=0/micro_step=160/global_step=160, RunningAvgSamplesPerSec=8.221465212219664, CurrSamplesPerSec=8.19060923615143, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 161, Loss: 0.18439224362373352 +[2024-01-21 17:22:59,454] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[1.9994902364689513e-05, 1.9994902364689513e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:22:59,455] [INFO] [timer.py:260:stop] epoch=0/micro_step=170/global_step=170, RunningAvgSamplesPerSec=8.220503091160918, CurrSamplesPerSec=8.215062745730751, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 171, Loss: 0.0478808656334877 +[2024-01-21 17:23:38,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[1.9993342037267202e-05, 1.9993342037267202e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:23:38,439] [INFO] [timer.py:260:stop] epoch=0/micro_step=180/global_step=180, RunningAvgSamplesPerSec=8.21997301956434, CurrSamplesPerSec=8.21932786046576, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 181, Loss: 0.03981553390622139 +[2024-01-21 17:24:17,381] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[1.9991573764320068e-05, 1.9991573764320068e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:24:17,381] [INFO] [timer.py:260:stop] epoch=0/micro_step=190/global_step=190, RunningAvgSamplesPerSec=8.219971055376378, CurrSamplesPerSec=8.205622114006278, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 191, Loss: 0.13556432723999023 +[2024-01-21 17:24:56,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[1.998959758264306e-05, 1.998959758264306e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:24:56,392] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=8.219239341955252, CurrSamplesPerSec=8.200610540962902, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 201, Loss: 0.14130252599716187 +[2024-01-21 17:25:35,329] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[1.9987413533357358e-05, 1.9987413533357358e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:25:35,330] [INFO] [timer.py:260:stop] epoch=0/micro_step=210/global_step=210, RunningAvgSamplesPerSec=8.21931679422773, CurrSamplesPerSec=8.221022457209779, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 211, Loss: 0.2083735316991806 +[2024-01-21 17:26:14,293] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[1.9985021661909556e-05, 1.9985021661909556e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:26:14,293] [INFO] [timer.py:260:stop] epoch=0/micro_step=220/global_step=220, RunningAvgSamplesPerSec=8.219133840756864, CurrSamplesPerSec=8.20761219690653, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 221, Loss: 0.13403818011283875 +[2024-01-21 17:26:53,286] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[1.998242201807069e-05, 1.998242201807069e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:26:53,286] [INFO] [timer.py:260:stop] epoch=0/micro_step=230/global_step=230, RunningAvgSamplesPerSec=8.21869953301881, CurrSamplesPerSec=8.239058278481568, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 231, Loss: 0.06681068241596222 +[2024-01-21 17:27:32,274] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[1.99796146559352e-05, 1.99796146559352e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:27:32,275] [INFO] [timer.py:260:stop] epoch=0/micro_step=240/global_step=240, RunningAvgSamplesPerSec=8.218338898651945, CurrSamplesPerSec=8.199533922342953, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 241, Loss: 0.14689813554286957 +[2024-01-21 17:28:11,256] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[1.997659963391982e-05, 1.997659963391982e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:28:11,257] [INFO] [timer.py:260:stop] epoch=0/micro_step=250/global_step=250, RunningAvgSamplesPerSec=8.21806555213872, CurrSamplesPerSec=8.211170255948504, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 251, Loss: 0.03474677726626396 +[2024-01-21 17:28:50,190] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[1.9973377014762352e-05, 1.9973377014762352e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:28:50,190] [INFO] [timer.py:260:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=8.21822209191955, CurrSamplesPerSec=8.226998386262458, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 261, Loss: 0.29164114594459534 +[2024-01-21 17:29:29,172] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[1.9969946865520372e-05, 1.9969946865520372e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:29:29,172] [INFO] [timer.py:260:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=8.217972718658102, CurrSamplesPerSec=8.185796700937571, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 271, Loss: 0.24175597727298737 +[2024-01-21 17:30:08,107] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[1.996630925756982e-05, 1.996630925756982e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:30:08,108] [INFO] [timer.py:260:stop] epoch=0/micro_step=280/global_step=280, RunningAvgSamplesPerSec=8.218093211687759, CurrSamplesPerSec=8.20533617535483, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 281, Loss: 0.22860103845596313 +[2024-01-21 17:30:47,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[1.9962464266603517e-05, 1.9962464266603517e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:30:47,060] [INFO] [timer.py:260:stop] epoch=0/micro_step=290/global_step=290, RunningAvgSamplesPerSec=8.218081566228031, CurrSamplesPerSec=8.230797378585864, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 291, Loss: 0.14682930707931519 +[2024-01-21 17:31:26,004] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[1.99584119726296e-05, 1.99584119726296e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:31:26,005] [INFO] [timer.py:260:stop] epoch=0/micro_step=300/global_step=300, RunningAvgSamplesPerSec=8.218128005361944, CurrSamplesPerSec=8.225991963740302, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 301, Loss: 0.1054423600435257 +[2024-01-21 17:32:04,971] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[1.995415245996985e-05, 1.995415245996985e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:32:04,971] [INFO] [timer.py:260:stop] epoch=0/micro_step=310/global_step=310, RunningAvgSamplesPerSec=8.218020088337276, CurrSamplesPerSec=8.1756013923001, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 311, Loss: 0.09509023278951645 +[2024-01-21 17:32:43,960] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[1.9949685817257935e-05, 1.9949685817257935e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:32:43,960] [INFO] [timer.py:260:stop] epoch=0/micro_step=320/global_step=320, RunningAvgSamplesPerSec=8.217772892822689, CurrSamplesPerSec=8.187687771131769, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 321, Loss: 0.06107733026146889 +[2024-01-21 17:33:22,894] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[1.9945012137437583e-05, 1.9945012137437583e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:33:22,895] [INFO] [timer.py:260:stop] epoch=0/micro_step=330/global_step=330, RunningAvgSamplesPerSec=8.217886791757627, CurrSamplesPerSec=8.243524544124822, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 331, Loss: 0.1336403638124466 +[2024-01-21 17:34:01,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[1.9940131517760616e-05, 1.9940131517760616e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:34:01,843] [INFO] [timer.py:260:stop] epoch=0/micro_step=340/global_step=340, RunningAvgSamplesPerSec=8.217907424642574, CurrSamplesPerSec=8.219154714642555, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 341, Loss: 0.19123783707618713 +[2024-01-21 17:34:40,847] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[1.9935044059784953e-05, 1.9935044059784953e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:34:40,848] [INFO] [timer.py:260:stop] epoch=0/micro_step=350/global_step=350, RunningAvgSamplesPerSec=8.217591709269419, CurrSamplesPerSec=8.215639519890516, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 351, Loss: 0.04643106460571289 +[2024-01-21 17:35:19,789] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[1.992974986937249e-05, 1.992974986937249e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:35:19,790] [INFO] [timer.py:260:stop] epoch=0/micro_step=360/global_step=360, RunningAvgSamplesPerSec=8.217659585566695, CurrSamplesPerSec=8.211703276318275, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 361, Loss: 0.029723739251494408 +[2024-01-21 17:35:58,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[1.9924249056686893e-05, 1.9924249056686893e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:35:58,680] [INFO] [timer.py:260:stop] epoch=0/micro_step=370/global_step=370, RunningAvgSamplesPerSec=8.218022226379004, CurrSamplesPerSec=8.238476188792585, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 371, Loss: 0.17806129157543182 +[2024-01-21 17:36:37,642] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[1.991854173619131e-05, 1.991854173619131e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:36:37,642] [INFO] [timer.py:260:stop] epoch=0/micro_step=380/global_step=380, RunningAvgSamplesPerSec=8.217960931133712, CurrSamplesPerSec=8.210889455894261, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 381, Loss: 0.04113328084349632 +[2024-01-21 17:37:16,669] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[1.9912628026645993e-05, 1.9912628026645993e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:37:16,670] [INFO] [timer.py:260:stop] epoch=0/micro_step=390/global_step=390, RunningAvgSamplesPerSec=8.217547885676073, CurrSamplesPerSec=8.193208678306608, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 391, Loss: 0.1758180409669876 +[2024-01-21 17:37:55,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[1.9906508051105802e-05, 1.9906508051105802e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:37:55,637] [INFO] [timer.py:260:stop] epoch=0/micro_step=400/global_step=400, RunningAvgSamplesPerSec=8.217478192063506, CurrSamplesPerSec=8.229289470187757, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 401, Loss: 0.06669902801513672 +[2024-01-21 17:38:34,602] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[1.9900181936917686e-05, 1.9900181936917686e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:38:34,602] [INFO] [timer.py:260:stop] epoch=0/micro_step=410/global_step=410, RunningAvgSamplesPerSec=8.217419832443051, CurrSamplesPerSec=8.172714011119877, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 411, Loss: 0.05748552829027176 +[2024-01-21 17:39:13,574] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[1.9893649815718e-05, 1.9893649815718e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:39:13,575] [INFO] [timer.py:260:stop] epoch=0/micro_step=420/global_step=420, RunningAvgSamplesPerSec=8.217327850232902, CurrSamplesPerSec=8.241584823044555, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 421, Loss: 0.1349376142024994 +[2024-01-21 17:39:52,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[1.9886911823429776e-05, 1.9886911823429776e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:39:52,541] [INFO] [timer.py:260:stop] epoch=0/micro_step=430/global_step=430, RunningAvgSamplesPerSec=8.217270371065807, CurrSamplesPerSec=8.198808654208323, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 431, Loss: 0.11941323429346085 +[2024-01-21 17:40:31,452] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[1.98799681002599e-05, 1.98799681002599e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:40:31,452] [INFO] [timer.py:260:stop] epoch=0/micro_step=440/global_step=440, RunningAvgSamplesPerSec=8.217477412374489, CurrSamplesPerSec=8.231955936517064, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 441, Loss: 0.06861431151628494 +[2024-01-21 17:41:10,471] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[1.9872818790696186e-05, 1.9872818790696186e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:41:10,472] [INFO] [timer.py:260:stop] epoch=0/micro_step=450/global_step=450, RunningAvgSamplesPerSec=8.217168194283893, CurrSamplesPerSec=8.203799972543516, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 451, Loss: 0.11355195939540863 +[2024-01-21 17:41:49,447] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[1.986546404350437e-05, 1.986546404350437e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:41:49,447] [INFO] [timer.py:260:stop] epoch=0/micro_step=460/global_step=460, RunningAvgSamplesPerSec=8.21707524236449, CurrSamplesPerSec=8.183255843502673, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 461, Loss: 0.06028124317526817 +[2024-01-21 17:42:28,415] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[1.9857904011725033e-05, 1.9857904011725033e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:42:28,416] [INFO] [timer.py:260:stop] epoch=0/micro_step=470/global_step=470, RunningAvgSamplesPerSec=8.21701876562333, CurrSamplesPerSec=8.208960040121957, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 471, Loss: 0.034633129835128784 +[2024-01-21 17:43:07,378] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[1.985013885267038e-05, 1.985013885267038e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:43:07,379] [INFO] [timer.py:260:stop] epoch=0/micro_step=480/global_step=480, RunningAvgSamplesPerSec=8.216987356362488, CurrSamplesPerSec=8.240520186884066, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 481, Loss: 0.2307901531457901 +[2024-01-21 17:43:46,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[1.9842168727921006e-05, 1.9842168727921006e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:43:46,410] [INFO] [timer.py:260:stop] epoch=0/micro_step=490/global_step=490, RunningAvgSamplesPerSec=8.216665066053121, CurrSamplesPerSec=8.21998427017957, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 491, Loss: 0.0520152747631073 +[2024-01-21 17:44:25,373] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[1.98339938033225e-05, 1.98339938033225e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:44:25,373] [INFO] [timer.py:260:stop] epoch=0/micro_step=500/global_step=500, RunningAvgSamplesPerSec=8.216642060220835, CurrSamplesPerSec=8.205117470819896, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 501, Loss: 0.17667724192142487 +[2024-01-21 17:45:04,296] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[1.9825614248982025e-05, 1.9825614248982025e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:45:04,297] [INFO] [timer.py:260:stop] epoch=0/micro_step=510/global_step=510, RunningAvgSamplesPerSec=8.216783772447966, CurrSamplesPerSec=8.213684246729004, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 511, Loss: 0.04984883591532707 +[2024-01-21 17:45:43,303] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[1.9817030239264753e-05, 1.9817030239264753e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:45:43,304] [INFO] [timer.py:260:stop] epoch=0/micro_step=520/global_step=520, RunningAvgSamplesPerSec=8.2165791469321, CurrSamplesPerSec=8.193314210799926, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 521, Loss: 0.031982630491256714 +[2024-01-21 17:46:22,287] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[1.9808241952790245e-05, 1.9808241952790245e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:46:22,288] [INFO] [timer.py:260:stop] epoch=0/micro_step=530/global_step=530, RunningAvgSamplesPerSec=8.21647571365042, CurrSamplesPerSec=8.235282986649375, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 531, Loss: 0.19390220940113068 +[2024-01-21 17:47:01,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[1.9799249572428744e-05, 1.9799249572428744e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:47:01,250] [INFO] [timer.py:260:stop] epoch=0/micro_step=540/global_step=540, RunningAvgSamplesPerSec=8.216462512388773, CurrSamplesPerSec=8.215007435986797, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 541, Loss: 0.15826433897018433 +[2024-01-21 17:47:40,254] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[1.9790053285297356e-05, 1.9790053285297356e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:47:40,255] [INFO] [timer.py:260:stop] epoch=0/micro_step=550/global_step=550, RunningAvgSamplesPerSec=8.216284345943006, CurrSamplesPerSec=8.158729672431974, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 551, Loss: 0.11331122368574142 +[2024-01-21 17:48:19,218] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[1.9780653282756162e-05, 1.9780653282756162e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:48:19,218] [INFO] [timer.py:260:stop] epoch=0/micro_step=560/global_step=560, RunningAvgSamplesPerSec=8.216267590802167, CurrSamplesPerSec=8.18271304022321, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 561, Loss: 0.03512731194496155 +[2024-01-21 17:48:58,149] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[1.9771049760404236e-05, 1.9771049760404236e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:48:58,150] [INFO] [timer.py:260:stop] epoch=0/micro_step=570/global_step=570, RunningAvgSamplesPerSec=8.216372313460882, CurrSamplesPerSec=8.230484447205564, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 571, Loss: 0.06923100352287292 +[2024-01-21 17:49:37,113] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[1.9761242918075584e-05, 1.9761242918075584e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:49:37,113] [INFO] [timer.py:260:stop] epoch=0/micro_step=580/global_step=580, RunningAvgSamplesPerSec=8.216355858446022, CurrSamplesPerSec=8.239605042316018, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 581, Loss: 0.07558969408273697 +[2024-01-21 17:50:16,077] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[1.975123295983496e-05, 1.975123295983496e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:50:16,078] [INFO] [timer.py:260:stop] epoch=0/micro_step=590/global_step=590, RunningAvgSamplesPerSec=8.216337553525117, CurrSamplesPerSec=8.220383503122262, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 591, Loss: 0.27879956364631653 +[2024-01-21 17:50:55,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[1.9741020093973648e-05, 1.9741020093973648e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:50:55,017] [INFO] [timer.py:260:stop] epoch=0/micro_step=600/global_step=600, RunningAvgSamplesPerSec=8.216410507234707, CurrSamplesPerSec=8.246019379026507, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 601, Loss: 0.09282868355512619 +[2024-01-21 17:51:34,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[1.9730604533005116e-05, 1.9730604533005116e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:51:34,012] [INFO] [timer.py:260:stop] epoch=0/micro_step=610/global_step=610, RunningAvgSamplesPerSec=8.216282005293477, CurrSamplesPerSec=8.2083927374493, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 611, Loss: 0.12027372419834137 +[2024-01-21 17:52:12,941] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[1.971998649366059e-05, 1.971998649366059e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:52:12,941] [INFO] [timer.py:260:stop] epoch=0/micro_step=620/global_step=620, RunningAvgSamplesPerSec=8.216387629330569, CurrSamplesPerSec=8.228865155175162, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 621, Loss: 0.148494690656662 +[2024-01-21 17:52:51,864] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[1.9709166196884553e-05, 1.9709166196884553e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:52:51,865] [INFO] [timer.py:260:stop] epoch=0/micro_step=630/global_step=630, RunningAvgSamplesPerSec=8.216507454508108, CurrSamplesPerSec=8.22222208608816, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 631, Loss: 0.03289279341697693 +[2024-01-21 17:53:30,764] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[1.969814386783014e-05, 1.969814386783014e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:53:30,765] [INFO] [timer.py:260:stop] epoch=0/micro_step=640/global_step=640, RunningAvgSamplesPerSec=8.216698698301864, CurrSamplesPerSec=8.213651071929595, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 641, Loss: 0.12843376398086548 +[2024-01-21 17:54:09,761] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[1.968691973585445e-05, 1.968691973585445e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:54:09,761] [INFO] [timer.py:260:stop] epoch=0/micro_step=650/global_step=650, RunningAvgSamplesPerSec=8.216572856759404, CurrSamplesPerSec=8.219334907252586, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 651, Loss: 0.13127140700817108 +[2024-01-21 17:54:48,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[1.9675494034513792e-05, 1.9675494034513792e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:54:48,678] [INFO] [timer.py:260:stop] epoch=0/micro_step=660/global_step=660, RunningAvgSamplesPerSec=8.216702929443956, CurrSamplesPerSec=8.224585099363395, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 661, Loss: 0.07959669083356857 +[2024-01-21 17:55:27,627] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[1.9663867001558805e-05, 1.9663867001558805e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:55:27,628] [INFO] [timer.py:260:stop] epoch=0/micro_step=670/global_step=670, RunningAvgSamplesPerSec=8.216728203841157, CurrSamplesPerSec=8.220016489306387, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 671, Loss: 0.10658825188875198 +[2024-01-21 17:56:06,610] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[1.9652038878929516e-05, 1.9652038878929516e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:56:06,611] [INFO] [timer.py:260:stop] epoch=0/micro_step=680/global_step=680, RunningAvgSamplesPerSec=8.21664775973802, CurrSamplesPerSec=8.210587077313082, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 681, Loss: 0.047979217022657394 +[2024-01-21 17:56:45,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[1.9640009912750313e-05, 1.9640009912750313e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:56:45,535] [INFO] [timer.py:260:stop] epoch=0/micro_step=690/global_step=690, RunningAvgSamplesPerSec=8.216752451837895, CurrSamplesPerSec=8.235796400941457, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 691, Loss: 0.05715488642454147 +[2024-01-21 17:57:24,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[1.9627780353324816e-05, 1.9627780353324816e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:57:24,518] [INFO] [timer.py:260:stop] epoch=0/micro_step=700/global_step=700, RunningAvgSamplesPerSec=8.21667437922945, CurrSamplesPerSec=8.224457592987363, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 701, Loss: 0.18884383141994476 +[2024-01-21 17:58:03,468] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[1.9615350455130666e-05, 1.9615350455130666e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:58:03,468] [INFO] [timer.py:260:stop] epoch=0/micro_step=710/global_step=710, RunningAvgSamplesPerSec=8.216694976914107, CurrSamplesPerSec=8.216973903350295, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 711, Loss: 0.07247164845466614 +[2024-01-21 17:58:42,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=0, lr=[1.9602720476814246e-05, 1.9602720476814246e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:58:42,410] [INFO] [timer.py:260:stop] epoch=0/micro_step=720/global_step=720, RunningAvgSamplesPerSec=8.216741462205087, CurrSamplesPerSec=8.216648943822564, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 721, Loss: 0.15960893034934998 +[2024-01-21 17:59:21,395] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=0, lr=[1.958989068118527e-05, 1.958989068118527e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 17:59:21,395] [INFO] [timer.py:260:stop] epoch=0/micro_step=730/global_step=730, RunningAvgSamplesPerSec=8.216660176652958, CurrSamplesPerSec=8.202780167392842, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 731, Loss: 0.19701063632965088 +[2024-01-21 18:00:00,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=0, lr=[1.957686133521136e-05, 1.957686133521136e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:00:00,345] [INFO] [timer.py:260:stop] epoch=0/micro_step=740/global_step=740, RunningAvgSamplesPerSec=8.216682476143374, CurrSamplesPerSec=8.23132285399059, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 741, Loss: 0.1315952092409134 +[2024-01-21 18:00:39,332] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=0, lr=[1.9563632710012426e-05, 1.9563632710012426e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:00:39,332] [INFO] [timer.py:260:stop] epoch=0/micro_step=750/global_step=750, RunningAvgSamplesPerSec=8.216598627993067, CurrSamplesPerSec=8.214952126987608, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 751, Loss: 0.09428518265485764 +[2024-01-21 18:01:18,272] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=0, lr=[1.9550205080855097e-05, 1.9550205080855097e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:01:18,272] [INFO] [timer.py:260:stop] epoch=0/micro_step=760/global_step=760, RunningAvgSamplesPerSec=8.216650664553566, CurrSamplesPerSec=8.217305428796994, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 761, Loss: 0.05911910533905029 +[2024-01-21 18:01:57,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=0, lr=[1.9536578727146928e-05, 1.9536578727146928e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:01:57,237] [INFO] [timer.py:260:stop] epoch=0/micro_step=770/global_step=770, RunningAvgSamplesPerSec=8.216631961259678, CurrSamplesPerSec=8.22044089919255, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 771, Loss: 0.08703680336475372 +[2024-01-21 18:02:36,285] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=0, lr=[1.9522753932430633e-05, 1.9522753932430633e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:02:36,285] [INFO] [timer.py:260:stop] epoch=0/micro_step=780/global_step=780, RunningAvgSamplesPerSec=8.216387617906875, CurrSamplesPerSec=8.225072986348918, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 781, Loss: 0.13196833431720734 +[2024-01-21 18:03:15,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=0, lr=[1.9508730984378164e-05, 1.9508730984378164e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:03:15,250] [INFO] [timer.py:260:stop] epoch=0/micro_step=790/global_step=790, RunningAvgSamplesPerSec=8.216372295112034, CurrSamplesPerSec=8.205010630870973, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 791, Loss: 0.15777122974395752 +[2024-01-21 18:03:54,238] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=0, lr=[1.9494510174784725e-05, 1.9494510174784725e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:03:54,239] [INFO] [timer.py:260:stop] epoch=0/micro_step=800/global_step=800, RunningAvgSamplesPerSec=8.21629277217411, CurrSamplesPerSec=8.212927827384368, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 801, Loss: 0.18616990745067596 +[2024-01-21 18:04:33,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=0, lr=[1.9480091799562706e-05, 1.9480091799562706e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:04:33,163] [INFO] [timer.py:260:stop] epoch=0/micro_step=810/global_step=810, RunningAvgSamplesPerSec=8.216383141572651, CurrSamplesPerSec=8.188016438047628, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 811, Loss: 0.20894335210323334 +[2024-01-21 18:05:12,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=0, lr=[1.946547615873552e-05, 1.946547615873552e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:05:12,148] [INFO] [timer.py:260:stop] epoch=0/micro_step=820/global_step=820, RunningAvgSamplesPerSec=8.216316605591492, CurrSamplesPerSec=8.207223236948906, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 821, Loss: 0.09269611537456512 +[2024-01-21 18:05:51,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=0, lr=[1.945066355643136e-05, 1.945066355643136e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:05:51,102] [INFO] [timer.py:260:stop] epoch=0/micro_step=830/global_step=830, RunningAvgSamplesPerSec=8.216330959907467, CurrSamplesPerSec=8.206878975408053, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 831, Loss: 0.03543330729007721 +[2024-01-21 18:06:30,126] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=0, lr=[1.943565430087689e-05, 1.943565430087689e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:06:30,127] [INFO] [timer.py:260:stop] epoch=0/micro_step=840/global_step=840, RunningAvgSamplesPerSec=8.216167419518161, CurrSamplesPerSec=8.195024115954899, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 841, Loss: 0.18056733906269073 +[2024-01-21 18:07:09,099] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=0, lr=[1.9420448704390792e-05, 1.9420448704390792e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:07:09,099] [INFO] [timer.py:260:stop] epoch=0/micro_step=850/global_step=850, RunningAvgSamplesPerSec=8.2161366132884, CurrSamplesPerSec=8.185469709803659, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 851, Loss: 0.09241245687007904 +[2024-01-21 18:07:48,070] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=0, lr=[1.9405047083377305e-05, 1.9405047083377305e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:07:48,071] [INFO] [timer.py:260:stop] epoch=0/micro_step=860/global_step=860, RunningAvgSamplesPerSec=8.21610933261812, CurrSamplesPerSec=8.19473841508927, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 861, Loss: 0.0623469203710556 +[2024-01-21 18:08:27,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=0, lr=[1.9389449758319624e-05, 1.9389449758319624e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:08:27,016] [INFO] [timer.py:260:stop] epoch=0/micro_step=870/global_step=870, RunningAvgSamplesPerSec=8.2161441040864, CurrSamplesPerSec=8.20389324171614, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 871, Loss: 0.23640227317810059 +[2024-01-21 18:09:06,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=0, lr=[1.937365705377323e-05, 1.937365705377323e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:09:06,027] [INFO] [timer.py:260:stop] epoch=0/micro_step=880/global_step=880, RunningAvgSamplesPerSec=8.216023829906927, CurrSamplesPerSec=8.20926029043487, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 881, Loss: 0.0813523456454277 +[2024-01-21 18:09:44,994] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=0, lr=[1.9357669298359137e-05, 1.9357669298359137e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:09:44,995] [INFO] [timer.py:260:stop] epoch=0/micro_step=890/global_step=890, RunningAvgSamplesPerSec=8.216006292470068, CurrSamplesPerSec=8.14537291049017, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 891, Loss: 0.16881151497364044 +[2024-01-21 18:10:24,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=0, lr=[1.9341486824757068e-05, 1.9341486824757068e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:10:24,004] [INFO] [timer.py:260:stop] epoch=0/micro_step=900/global_step=900, RunningAvgSamplesPerSec=8.215893416548148, CurrSamplesPerSec=8.212787113575002, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 901, Loss: 0.11007864028215408 +[2024-01-21 18:11:03,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=0, lr=[1.9325109969698507e-05, 1.9325109969698507e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:11:03,023] [INFO] [timer.py:260:stop] epoch=0/micro_step=910/global_step=910, RunningAvgSamplesPerSec=8.215759846356379, CurrSamplesPerSec=8.214323667946717, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 911, Loss: 0.13128942251205444 +[2024-01-21 18:11:41,978] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=0, lr=[1.930853907395972e-05, 1.930853907395972e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:11:41,978] [INFO] [timer.py:260:stop] epoch=0/micro_step=920/global_step=920, RunningAvgSamplesPerSec=8.21577368191725, CurrSamplesPerSec=8.217772829533248, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 921, Loss: 0.16916492581367493 +[2024-01-21 18:12:20,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=0, lr=[1.929177448235464e-05, 1.929177448235464e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:12:20,965] [INFO] [timer.py:260:stop] epoch=0/micro_step=930/global_step=930, RunningAvgSamplesPerSec=8.215717007384661, CurrSamplesPerSec=8.2260786798336, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 931, Loss: 0.04492010548710823 +[2024-01-21 18:12:59,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=0, lr=[1.927481654372771e-05, 1.927481654372771e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:12:59,931] [INFO] [timer.py:260:stop] epoch=0/micro_step=940/global_step=940, RunningAvgSamplesPerSec=8.215705975337684, CurrSamplesPerSec=8.22173403361167, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 941, Loss: 0.12300165742635727 +[2024-01-21 18:13:38,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=0, lr=[1.9257665610946604e-05, 1.9257665610946604e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:13:38,899] [INFO] [timer.py:260:stop] epoch=0/micro_step=950/global_step=950, RunningAvgSamplesPerSec=8.215694288036724, CurrSamplesPerSec=8.216534258361056, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 951, Loss: 0.1626824289560318 +[2024-01-21 18:14:17,850] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=0, lr=[1.9240322040894916e-05, 1.9240322040894916e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:14:17,851] [INFO] [timer.py:260:stop] epoch=0/micro_step=960/global_step=960, RunningAvgSamplesPerSec=8.215715548825028, CurrSamplesPerSec=8.232278068513684, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 961, Loss: 0.1052757278084755 +[2024-01-21 18:14:56,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=0, lr=[1.92227861944647e-05, 1.92227861944647e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:14:56,822] [INFO] [timer.py:260:stop] epoch=0/micro_step=970/global_step=970, RunningAvgSamplesPerSec=8.215696099024566, CurrSamplesPerSec=8.222379745966926, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 971, Loss: 0.05269880220293999 +[2024-01-21 18:15:35,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=0, lr=[1.920505843654898e-05, 1.920505843654898e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:15:35,838] [INFO] [timer.py:260:stop] epoch=0/micro_step=980/global_step=980, RunningAvgSamplesPerSec=8.215579093851936, CurrSamplesPerSec=8.19456830494665, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 981, Loss: 0.13983123004436493 +[2024-01-21 18:16:14,813] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=0, lr=[1.918713913603415e-05, 1.918713913603415e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:16:14,813] [INFO] [timer.py:260:stop] epoch=0/micro_step=990/global_step=990, RunningAvgSamplesPerSec=8.215552199697425, CurrSamplesPerSec=8.225251926351433, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 991, Loss: 0.1506224274635315 +[2024-01-21 18:16:53,800] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=0, lr=[1.9169028665792303e-05, 1.9169028665792303e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:16:53,800] [INFO] [timer.py:260:stop] epoch=0/micro_step=1000/global_step=1000, RunningAvgSamplesPerSec=8.215501740460997, CurrSamplesPerSec=8.215179401450822, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1001, Loss: 0.2549113631248474 +[2024-01-21 18:17:32,755] [INFO] [logging.py:96:log_dist] [Rank 0] step=1010, skipped=0, lr=[1.915072740267347e-05, 1.915072740267347e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:17:32,756] [INFO] [timer.py:260:stop] epoch=0/micro_step=1010/global_step=1010, RunningAvgSamplesPerSec=8.215517467643801, CurrSamplesPerSec=8.209652456444585, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1011, Loss: 0.04378005117177963 +[2024-01-21 18:18:11,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=1020, skipped=0, lr=[1.913223572749777e-05, 1.913223572749777e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:18:11,706] [INFO] [timer.py:260:stop] epoch=0/micro_step=1020/global_step=1020, RunningAvgSamplesPerSec=8.215543391496817, CurrSamplesPerSec=8.249710714344106, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1021, Loss: 0.028377555310726166 +[2024-01-21 18:18:50,693] [INFO] [logging.py:96:log_dist] [Rank 0] step=1030, skipped=0, lr=[1.9113554025047507e-05, 1.9113554025047507e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:18:50,694] [INFO] [timer.py:260:stop] epoch=0/micro_step=1030/global_step=1030, RunningAvgSamplesPerSec=8.215491563360954, CurrSamplesPerSec=8.21759169914901, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1031, Loss: 0.283130019903183 +[2024-01-21 18:19:29,633] [INFO] [logging.py:96:log_dist] [Rank 0] step=1040, skipped=0, lr=[1.9094682684059135e-05, 1.9094682684059135e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:19:29,634] [INFO] [timer.py:260:stop] epoch=0/micro_step=1040/global_step=1040, RunningAvgSamplesPerSec=8.215537915596379, CurrSamplesPerSec=8.232482064821436, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1041, Loss: 0.09308959543704987 +[2024-01-21 18:20:08,615] [INFO] [logging.py:96:log_dist] [Rank 0] step=1050, skipped=0, lr=[1.90756220972152e-05, 1.90756220972152e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:20:08,616] [INFO] [timer.py:260:stop] epoch=0/micro_step=1050/global_step=1050, RunningAvgSamplesPerSec=8.21549920354341, CurrSamplesPerSec=8.181742360240374, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1051, Loss: 0.06235988438129425 +[2024-01-21 18:20:47,562] [INFO] [logging.py:96:log_dist] [Rank 0] step=1060, skipped=0, lr=[1.9056372661136137e-05, 1.9056372661136137e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:20:47,563] [INFO] [timer.py:260:stop] epoch=0/micro_step=1060/global_step=1060, RunningAvgSamplesPerSec=8.215532102502607, CurrSamplesPerSec=8.171262130697233, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1061, Loss: 0.0734567642211914 +[2024-01-21 18:21:26,581] [INFO] [logging.py:96:log_dist] [Rank 0] step=1070, skipped=0, lr=[1.903693477637204e-05, 1.903693477637204e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:21:26,581] [INFO] [timer.py:260:stop] epoch=0/micro_step=1070/global_step=1070, RunningAvgSamplesPerSec=8.215421151429133, CurrSamplesPerSec=8.184340668001559, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1071, Loss: 0.13492776453495026 +[2024-01-21 18:22:05,556] [INFO] [logging.py:96:log_dist] [Rank 0] step=1080, skipped=0, lr=[1.9017308847394322e-05, 1.9017308847394322e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:22:05,556] [INFO] [timer.py:260:stop] epoch=0/micro_step=1080/global_step=1080, RunningAvgSamplesPerSec=8.215398340440316, CurrSamplesPerSec=8.207430008491306, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1081, Loss: 0.2568672001361847 +[2024-01-21 18:22:44,527] [INFO] [logging.py:96:log_dist] [Rank 0] step=1090, skipped=0, lr=[1.8997495282587293e-05, 1.8997495282587293e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:22:44,527] [INFO] [timer.py:260:stop] epoch=0/micro_step=1090/global_step=1090, RunningAvgSamplesPerSec=8.21538323109202, CurrSamplesPerSec=8.211811797897173, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1091, Loss: 0.03504878655076027 +[2024-01-21 18:23:23,482] [INFO] [logging.py:96:log_dist] [Rank 0] step=1100, skipped=0, lr=[1.897749449423967e-05, 1.897749449423967e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:23:23,483] [INFO] [timer.py:260:stop] epoch=0/micro_step=1100/global_step=1100, RunningAvgSamplesPerSec=8.215397573794684, CurrSamplesPerSec=8.217233990096663, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1101, Loss: 0.0655258297920227 +[2024-01-21 18:24:02,401] [INFO] [logging.py:96:log_dist] [Rank 0] step=1110, skipped=0, lr=[1.895730689853598e-05, 1.895730689853598e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:24:02,402] [INFO] [timer.py:260:stop] epoch=0/micro_step=1110/global_step=1110, RunningAvgSamplesPerSec=8.21548231581066, CurrSamplesPerSec=8.19995672069981, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1111, Loss: 0.07381191104650497 +[2024-01-21 18:24:41,352] [INFO] [logging.py:96:log_dist] [Rank 0] step=1120, skipped=0, lr=[1.8936932915547934e-05, 1.8936932915547934e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:24:41,353] [INFO] [timer.py:260:stop] epoch=0/micro_step=1120/global_step=1120, RunningAvgSamplesPerSec=8.215504723783107, CurrSamplesPerSec=8.209492773572888, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1121, Loss: 0.1172158271074295 +[2024-01-21 18:25:20,329] [INFO] [logging.py:96:log_dist] [Rank 0] step=1130, skipped=0, lr=[1.891637296922565e-05, 1.891637296922565e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:25:20,329] [INFO] [timer.py:260:stop] epoch=0/micro_step=1130/global_step=1130, RunningAvgSamplesPerSec=8.21547765451846, CurrSamplesPerSec=8.202332515917726, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1131, Loss: 0.08733037114143372 +[2024-01-21 18:25:59,244] [INFO] [logging.py:96:log_dist] [Rank 0] step=1140, skipped=0, lr=[1.8895627487388856e-05, 1.8895627487388856e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:25:59,244] [INFO] [timer.py:260:stop] epoch=0/micro_step=1140/global_step=1140, RunningAvgSamplesPerSec=8.215566646583559, CurrSamplesPerSec=8.233213808011724, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1141, Loss: 0.12091915309429169 +[2024-01-21 18:26:38,180] [INFO] [logging.py:96:log_dist] [Rank 0] step=1150, skipped=0, lr=[1.8874696901717967e-05, 1.8874696901717967e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:26:38,180] [INFO] [timer.py:260:stop] epoch=0/micro_step=1150/global_step=1150, RunningAvgSamplesPerSec=8.215615252022614, CurrSamplesPerSec=8.22906292795285, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1151, Loss: 0.048723094165325165 +[2024-01-21 18:27:17,162] [INFO] [logging.py:96:log_dist] [Rank 0] step=1160, skipped=0, lr=[1.8853581647745122e-05, 1.8853581647745122e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:27:17,163] [INFO] [timer.py:260:stop] epoch=0/micro_step=1160/global_step=1160, RunningAvgSamplesPerSec=8.215577877262048, CurrSamplesPerSec=8.17723167204737, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1161, Loss: 0.28542560338974 +[2024-01-21 18:27:56,124] [INFO] [logging.py:96:log_dist] [Rank 0] step=1170, skipped=0, lr=[1.8832282164845117e-05, 1.8832282164845117e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:27:56,125] [INFO] [timer.py:260:stop] epoch=0/micro_step=1170/global_step=1170, RunningAvgSamplesPerSec=8.21557799470625, CurrSamplesPerSec=8.219089786839547, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1171, Loss: 0.017982732504606247 +[2024-01-21 18:28:35,034] [INFO] [logging.py:96:log_dist] [Rank 0] step=1180, skipped=0, lr=[1.8810798896226253e-05, 1.8810798896226253e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:28:35,034] [INFO] [timer.py:260:stop] epoch=0/micro_step=1180/global_step=1180, RunningAvgSamplesPerSec=8.21567212077206, CurrSamplesPerSec=8.234426091216433, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1181, Loss: 0.16741488873958588 +[2024-01-21 18:29:14,027] [INFO] [logging.py:96:log_dist] [Rank 0] step=1190, skipped=0, lr=[1.8789132288921116e-05, 1.8789132288921116e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:29:14,027] [INFO] [timer.py:260:stop] epoch=0/micro_step=1190/global_step=1190, RunningAvgSamplesPerSec=8.215616289662401, CurrSamplesPerSec=8.233810307761507, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1191, Loss: 0.18302080035209656 +[2024-01-21 18:29:53,021] [INFO] [logging.py:96:log_dist] [Rank 0] step=1200, skipped=0, lr=[1.8767282793777282e-05, 1.8767282793777282e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:29:53,021] [INFO] [timer.py:260:stop] epoch=0/micro_step=1200/global_step=1200, RunningAvgSamplesPerSec=8.215559735805817, CurrSamplesPerSec=8.22299785115252, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1201, Loss: 0.15817832946777344 +[2024-01-21 18:30:31,928] [INFO] [logging.py:96:log_dist] [Rank 0] step=1210, skipped=0, lr=[1.8745250865447933e-05, 1.8745250865447933e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:30:31,929] [INFO] [timer.py:260:stop] epoch=0/micro_step=1210/global_step=1210, RunningAvgSamplesPerSec=8.215655540793344, CurrSamplesPerSec=8.246773292413767, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1211, Loss: 0.09869858622550964 +[2024-01-21 18:31:10,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=1220, skipped=0, lr=[1.872303696238239e-05, 1.872303696238239e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:31:10,936] [INFO] [timer.py:260:stop] epoch=0/micro_step=1220/global_step=1220, RunningAvgSamplesPerSec=8.215577439887916, CurrSamplesPerSec=8.174716541686767, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1221, Loss: 0.02718920074403286 +[2024-01-21 18:31:49,910] [INFO] [logging.py:96:log_dist] [Rank 0] step=1230, skipped=0, lr=[1.8700641546816584e-05, 1.8700641546816584e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:31:49,911] [INFO] [timer.py:260:stop] epoch=0/micro_step=1230/global_step=1230, RunningAvgSamplesPerSec=8.215555416110234, CurrSamplesPerSec=8.220670994994872, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1231, Loss: 0.16207434237003326 +[2024-01-21 18:32:28,875] [INFO] [logging.py:96:log_dist] [Rank 0] step=1240, skipped=0, lr=[1.8678065084763425e-05, 1.8678065084763425e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:32:28,875] [INFO] [timer.py:260:stop] epoch=0/micro_step=1240/global_step=1240, RunningAvgSamplesPerSec=8.21555130632173, CurrSamplesPerSec=8.228484772018312, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1241, Loss: 0.17771464586257935 +[2024-01-21 18:33:07,807] [INFO] [logging.py:96:log_dist] [Rank 0] step=1250, skipped=0, lr=[1.865530804600312e-05, 1.865530804600312e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:33:07,808] [INFO] [timer.py:260:stop] epoch=0/micro_step=1250/global_step=1250, RunningAvgSamplesPerSec=8.215601829517068, CurrSamplesPerSec=8.200190180182403, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1251, Loss: 0.028592301532626152 +[2024-01-21 18:33:46,763] [INFO] [logging.py:96:log_dist] [Rank 0] step=1260, skipped=0, lr=[1.8632370904073385e-05, 1.8632370904073385e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:33:46,764] [INFO] [timer.py:260:stop] epoch=0/micro_step=1260/global_step=1260, RunningAvgSamplesPerSec=8.215611450498024, CurrSamplesPerSec=8.21310674190478, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1261, Loss: 0.21924546360969543 +[2024-01-21 18:34:25,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=1270, skipped=0, lr=[1.8609254136259594e-05, 1.8609254136259594e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:34:25,731] [INFO] [timer.py:260:stop] epoch=0/micro_step=1270/global_step=1270, RunningAvgSamplesPerSec=8.215602565584987, CurrSamplesPerSec=8.228269371689088, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1271, Loss: 0.0925293117761612 +[2024-01-21 18:35:04,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=1280, skipped=0, lr=[1.8585958223584856e-05, 1.8585958223584856e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:35:04,674] [INFO] [timer.py:260:stop] epoch=0/micro_step=1280/global_step=1280, RunningAvgSamplesPerSec=8.215633658355445, CurrSamplesPerSec=8.232989069727642, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1281, Loss: 0.026805948466062546 +[2024-01-21 18:35:43,644] [INFO] [logging.py:96:log_dist] [Rank 0] step=1290, skipped=0, lr=[1.8562483650799988e-05, 1.8562483650799988e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:35:43,645] [INFO] [timer.py:260:stop] epoch=0/micro_step=1290/global_step=1290, RunningAvgSamplesPerSec=8.215620109871953, CurrSamplesPerSec=8.229893980223025, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1291, Loss: 0.06829668581485748 +[2024-01-21 18:36:22,594] [INFO] [logging.py:96:log_dist] [Rank 0] step=1300, skipped=0, lr=[1.853883090637345e-05, 1.853883090637345e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:36:22,595] [INFO] [timer.py:260:stop] epoch=0/micro_step=1300/global_step=1300, RunningAvgSamplesPerSec=8.215639609000426, CurrSamplesPerSec=8.239961666523929, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1301, Loss: 0.13412977755069733 +[2024-01-21 18:37:01,589] [INFO] [logging.py:96:log_dist] [Rank 0] step=1310, skipped=0, lr=[1.8515000482481173e-05, 1.8515000482481173e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:37:01,589] [INFO] [timer.py:260:stop] epoch=0/micro_step=1310/global_step=1310, RunningAvgSamplesPerSec=8.215586272351628, CurrSamplesPerSec=8.187509962340599, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1311, Loss: 0.0903964638710022 +[2024-01-21 18:37:40,498] [INFO] [logging.py:96:log_dist] [Rank 0] step=1320, skipped=0, lr=[1.8490992874996298e-05, 1.8490992874996298e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:37:40,499] [INFO] [timer.py:260:stop] epoch=0/micro_step=1320/global_step=1320, RunningAvgSamplesPerSec=8.215670371802473, CurrSamplesPerSec=8.229105309070611, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1321, Loss: 0.16146157681941986 +[2024-01-21 18:38:19,400] [INFO] [logging.py:96:log_dist] [Rank 0] step=1330, skipped=0, lr=[1.8466808583478886e-05, 1.8466808583478886e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:38:19,400] [INFO] [timer.py:260:stop] epoch=0/micro_step=1330/global_step=1330, RunningAvgSamplesPerSec=8.21576569719394, CurrSamplesPerSec=8.231144659327443, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1331, Loss: 0.14543136954307556 +[2024-01-21 18:38:58,345] [INFO] [logging.py:96:log_dist] [Rank 0] step=1340, skipped=0, lr=[1.844244811116551e-05, 1.844244811116551e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:38:58,346] [INFO] [timer.py:260:stop] epoch=0/micro_step=1340/global_step=1340, RunningAvgSamplesPerSec=8.21579091206851, CurrSamplesPerSec=8.25018789441769, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1341, Loss: 0.1009281650185585 +[2024-01-21 18:39:37,259] [INFO] [logging.py:96:log_dist] [Rank 0] step=1350, skipped=0, lr=[1.841791196495879e-05, 1.841791196495879e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:39:37,260] [INFO] [timer.py:260:stop] epoch=0/micro_step=1350/global_step=1350, RunningAvgSamplesPerSec=8.215863614539524, CurrSamplesPerSec=8.21244891760514, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1351, Loss: 0.08579594641923904 +[2024-01-21 18:40:16,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=1360, skipped=0, lr=[1.8393200655416824e-05, 1.8393200655416824e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:40:16,234] [INFO] [timer.py:260:stop] epoch=0/micro_step=1360/global_step=1360, RunningAvgSamplesPerSec=8.215843189772825, CurrSamplesPerSec=8.195526517452883, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1361, Loss: 0.048801448196172714 +[2024-01-21 18:40:55,184] [INFO] [logging.py:96:log_dist] [Rank 0] step=1370, skipped=0, lr=[1.8368314696742597e-05, 1.8368314696742597e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:40:55,184] [INFO] [timer.py:260:stop] epoch=0/micro_step=1370/global_step=1370, RunningAvgSamplesPerSec=8.215858679838476, CurrSamplesPerSec=8.223820120404843, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1371, Loss: 0.15791672468185425 +[2024-01-21 18:41:34,125] [INFO] [logging.py:96:log_dist] [Rank 0] step=1380, skipped=0, lr=[1.834325460677325e-05, 1.834325460677325e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:41:34,126] [INFO] [timer.py:260:stop] epoch=0/micro_step=1380/global_step=1380, RunningAvgSamplesPerSec=8.215887198417182, CurrSamplesPerSec=8.229839479849643, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1381, Loss: 0.17684508860111237 +[2024-01-21 18:42:13,078] [INFO] [logging.py:96:log_dist] [Rank 0] step=1390, skipped=0, lr=[1.8318020906969335e-05, 1.8318020906969335e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:42:13,079] [INFO] [timer.py:260:stop] epoch=0/micro_step=1390/global_step=1390, RunningAvgSamplesPerSec=8.215898665970888, CurrSamplesPerSec=8.202839323202351, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1391, Loss: 0.14925755560398102 +[2024-01-21 18:42:52,049] [INFO] [logging.py:96:log_dist] [Rank 0] step=1400, skipped=0, lr=[1.8292614122403928e-05, 1.8292614122403928e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:42:52,050] [INFO] [timer.py:260:stop] epoch=0/micro_step=1400/global_step=1400, RunningAvgSamplesPerSec=8.215882301275276, CurrSamplesPerSec=8.201326105143123, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1401, Loss: 0.09458407014608383 +[2024-01-21 18:43:31,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=1410, skipped=0, lr=[1.826703478175174e-05, 1.826703478175174e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:43:31,015] [INFO] [timer.py:260:stop] epoch=0/micro_step=1410/global_step=1410, RunningAvgSamplesPerSec=8.215875540590837, CurrSamplesPerSec=8.201566157792689, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1411, Loss: 0.10526854544878006 +[2024-01-21 18:44:10,021] [INFO] [logging.py:96:log_dist] [Rank 0] step=1420, skipped=0, lr=[1.8241283417278094e-05, 1.8241283417278094e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:44:10,022] [INFO] [timer.py:260:stop] epoch=0/micro_step=1420/global_step=1420, RunningAvgSamplesPerSec=8.215806772116773, CurrSamplesPerSec=8.172744865444884, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1421, Loss: 0.21549783647060394 +[2024-01-21 18:44:49,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=1430, skipped=0, lr=[1.821536056482785e-05, 1.821536056482785e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:44:49,013] [INFO] [timer.py:260:stop] epoch=0/micro_step=1430/global_step=1430, RunningAvgSamplesPerSec=8.215762232832882, CurrSamplesPerSec=8.20034198607678, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1431, Loss: 0.15206506848335266 +[2024-01-21 18:45:28,029] [INFO] [logging.py:96:log_dist] [Rank 0] step=1440, skipped=0, lr=[1.818926676381426e-05, 1.818926676381426e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:45:28,029] [INFO] [timer.py:260:stop] epoch=0/micro_step=1440/global_step=1440, RunningAvgSamplesPerSec=8.215680911290733, CurrSamplesPerSec=8.189147993258695, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1441, Loss: 0.045436691492795944 +[2024-01-21 18:46:07,084] [INFO] [logging.py:96:log_dist] [Rank 0] step=1450, skipped=0, lr=[1.8163002557207754e-05, 1.8163002557207754e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:46:07,084] [INFO] [timer.py:260:stop] epoch=0/micro_step=1450/global_step=1450, RunningAvgSamplesPerSec=8.215543914974978, CurrSamplesPerSec=8.171764608037556, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1451, Loss: 0.04045216366648674 +[2024-01-21 18:46:46,055] [INFO] [logging.py:96:log_dist] [Rank 0] step=1460, skipped=0, lr=[1.813656849152462e-05, 1.813656849152462e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:46:46,056] [INFO] [timer.py:260:stop] epoch=0/micro_step=1460/global_step=1460, RunningAvgSamplesPerSec=8.215530731563636, CurrSamplesPerSec=8.179549451673871, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1461, Loss: 0.22418972849845886 +[2024-01-21 18:47:24,997] [INFO] [logging.py:96:log_dist] [Rank 0] step=1470, skipped=0, lr=[1.8109965116815647e-05, 1.8109965116815647e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:47:24,997] [INFO] [timer.py:260:stop] epoch=0/micro_step=1470/global_step=1470, RunningAvgSamplesPerSec=8.215559799605726, CurrSamplesPerSec=8.236324031137048, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1471, Loss: 0.03669540584087372 +[2024-01-21 18:48:03,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=1480, skipped=0, lr=[1.8083192986654668e-05, 1.8083192986654668e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:48:03,949] [INFO] [timer.py:260:stop] epoch=0/micro_step=1480/global_step=1480, RunningAvgSamplesPerSec=8.215575325720543, CurrSamplesPerSec=8.215260861288769, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1481, Loss: 0.18877367675304413 +[2024-01-21 18:48:42,922] [INFO] [logging.py:96:log_dist] [Rank 0] step=1490, skipped=0, lr=[1.8056252658127064e-05, 1.8056252658127064e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:48:42,923] [INFO] [timer.py:260:stop] epoch=0/micro_step=1490/global_step=1490, RunningAvgSamplesPerSec=8.215557749057835, CurrSamplesPerSec=8.25674418307078, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1491, Loss: 0.08001866191625595 +[2024-01-21 18:49:21,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=1500, skipped=0, lr=[1.8029144691818138e-05, 1.8029144691818138e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:49:21,895] [INFO] [timer.py:260:stop] epoch=0/micro_step=1500/global_step=1500, RunningAvgSamplesPerSec=8.21554291682677, CurrSamplesPerSec=8.212542886053413, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1501, Loss: 0.058151766657829285 +[2024-01-21 18:50:00,836] [INFO] [logging.py:96:log_dist] [Rank 0] step=1510, skipped=0, lr=[1.800186965180148e-05, 1.800186965180148e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:50:00,836] [INFO] [timer.py:260:stop] epoch=0/micro_step=1510/global_step=1510, RunningAvgSamplesPerSec=8.215572676102457, CurrSamplesPerSec=8.216136908060227, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1511, Loss: 0.08694794774055481 +[2024-01-21 18:50:39,837] [INFO] [logging.py:96:log_dist] [Rank 0] step=1520, skipped=0, lr=[1.797442810562721e-05, 1.797442810562721e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:50:39,838] [INFO] [timer.py:260:stop] epoch=0/micro_step=1520/global_step=1520, RunningAvgSamplesPerSec=8.215517680564666, CurrSamplesPerSec=8.133268607045402, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1521, Loss: 0.10218695551156998 +[2024-01-21 18:51:18,803] [INFO] [logging.py:96:log_dist] [Rank 0] step=1530, skipped=0, lr=[1.7946820624310184e-05, 1.7946820624310184e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:51:18,804] [INFO] [timer.py:260:stop] epoch=0/micro_step=1530/global_step=1530, RunningAvgSamplesPerSec=8.215512137036919, CurrSamplesPerSec=8.225007965008992, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1531, Loss: 0.14216217398643494 +[2024-01-21 18:51:57,749] [INFO] [logging.py:96:log_dist] [Rank 0] step=1540, skipped=0, lr=[1.79190477823181e-05, 1.79190477823181e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:51:57,750] [INFO] [timer.py:260:stop] epoch=0/micro_step=1540/global_step=1540, RunningAvgSamplesPerSec=8.2155346281515, CurrSamplesPerSec=8.22616943119139, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1541, Loss: 0.039431583136320114 +[2024-01-21 18:52:36,683] [INFO] [logging.py:96:log_dist] [Rank 0] step=1550, skipped=0, lr=[1.7891110157559542e-05, 1.7891110157559542e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:52:36,683] [INFO] [timer.py:260:stop] epoch=0/micro_step=1550/global_step=1550, RunningAvgSamplesPerSec=8.215573766269802, CurrSamplesPerSec=8.229451437872967, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1551, Loss: 0.02895725518465042 +[2024-01-21 18:53:15,624] [INFO] [logging.py:96:log_dist] [Rank 0] step=1560, skipped=0, lr=[1.7863008331371974e-05, 1.7863008331371974e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:53:15,625] [INFO] [timer.py:260:stop] epoch=0/micro_step=1560/global_step=1560, RunningAvgSamplesPerSec=8.215601317224287, CurrSamplesPerSec=8.218980569516043, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1561, Loss: 0.23856620490550995 +[2024-01-21 18:53:54,569] [INFO] [logging.py:96:log_dist] [Rank 0] step=1570, skipped=0, lr=[1.783474288850962e-05, 1.783474288850962e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:53:54,570] [INFO] [timer.py:260:stop] epoch=0/micro_step=1570/global_step=1570, RunningAvgSamplesPerSec=8.215624358174468, CurrSamplesPerSec=8.196969512827815, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1571, Loss: 0.20846883952617645 +[2024-01-21 18:54:33,608] [INFO] [logging.py:96:log_dist] [Rank 0] step=1580, skipped=0, lr=[1.7806314417131303e-05, 1.7806314417131303e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:54:33,608] [INFO] [timer.py:260:stop] epoch=0/micro_step=1580/global_step=1580, RunningAvgSamplesPerSec=8.215521384860672, CurrSamplesPerSec=8.216279245585634, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1581, Loss: 0.08299479633569717 +[2024-01-21 18:55:12,539] [INFO] [logging.py:96:log_dist] [Rank 0] step=1590, skipped=0, lr=[1.7777723508788226e-05, 1.7777723508788226e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:55:12,540] [INFO] [timer.py:260:stop] epoch=0/micro_step=1590/global_step=1590, RunningAvgSamplesPerSec=8.215562248488798, CurrSamplesPerSec=8.238140930326624, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1591, Loss: 0.05795734003186226 +[2024-01-21 18:55:51,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=1600, skipped=0, lr=[1.7748970758411627e-05, 1.7748970758411627e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:55:51,461] [INFO] [timer.py:260:stop] epoch=0/micro_step=1600/global_step=1600, RunningAvgSamplesPerSec=8.21561548352552, CurrSamplesPerSec=8.236266918475778, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1601, Loss: 0.04776901379227638 +[2024-01-21 18:56:30,460] [INFO] [logging.py:96:log_dist] [Rank 0] step=1610, skipped=0, lr=[1.7720056764300434e-05, 1.7720056764300434e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:56:30,461] [INFO] [timer.py:260:stop] epoch=0/micro_step=1610/global_step=1610, RunningAvgSamplesPerSec=8.215565866890984, CurrSamplesPerSec=8.173232593785961, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1611, Loss: 0.15657208859920502 +[2024-01-21 18:57:09,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=1620, skipped=0, lr=[1.769098212810879e-05, 1.769098212810879e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:57:09,432] [INFO] [timer.py:260:stop] epoch=0/micro_step=1620/global_step=1620, RunningAvgSamplesPerSec=8.21555359686824, CurrSamplesPerSec=8.229459511195431, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1621, Loss: 0.1602790802717209 +[2024-01-21 18:57:48,371] [INFO] [logging.py:96:log_dist] [Rank 0] step=1630, skipped=0, lr=[1.766174745483355e-05, 1.766174745483355e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:57:48,371] [INFO] [timer.py:260:stop] epoch=0/micro_step=1630/global_step=1630, RunningAvgSamplesPerSec=8.21558314410142, CurrSamplesPerSec=8.21659160069162, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1631, Loss: 0.03186400607228279 +[2024-01-21 18:58:27,270] [INFO] [logging.py:96:log_dist] [Rank 0] step=1640, skipped=0, lr=[1.7632353352801686e-05, 1.7632353352801686e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:58:27,271] [INFO] [timer.py:260:stop] epoch=0/micro_step=1640/global_step=1640, RunningAvgSamplesPerSec=8.215663654068857, CurrSamplesPerSec=8.23987870437839, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1641, Loss: 0.06355573236942291 +[2024-01-21 18:59:06,218] [INFO] [logging.py:96:log_dist] [Rank 0] step=1650, skipped=0, lr=[1.760280043365762e-05, 1.760280043365762e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:59:06,218] [INFO] [timer.py:260:stop] epoch=0/micro_step=1650/global_step=1650, RunningAvgSamplesPerSec=8.215681105246457, CurrSamplesPerSec=8.241500816089411, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1651, Loss: 0.03886401653289795 +[2024-01-21 18:59:45,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=1660, skipped=0, lr=[1.7573089312350517e-05, 1.7573089312350517e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 18:59:45,152] [INFO] [timer.py:260:stop] epoch=0/micro_step=1660/global_step=1660, RunningAvgSamplesPerSec=8.215716594397223, CurrSamplesPerSec=8.219626352805783, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1661, Loss: 0.05955221876502037 +[2024-01-21 19:00:24,106] [INFO] [logging.py:96:log_dist] [Rank 0] step=1670, skipped=0, lr=[1.7543220607121466e-05, 1.7543220607121466e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:00:24,107] [INFO] [timer.py:260:stop] epoch=0/micro_step=1670/global_step=1670, RunningAvgSamplesPerSec=8.215724331570996, CurrSamplesPerSec=8.19370985681155, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1671, Loss: 0.14705555140972137 +[2024-01-21 19:01:03,078] [INFO] [logging.py:96:log_dist] [Rank 0] step=1680, skipped=0, lr=[1.7513194939490633e-05, 1.7513194939490633e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:01:03,079] [INFO] [timer.py:260:stop] epoch=0/micro_step=1680/global_step=1680, RunningAvgSamplesPerSec=8.21571042077813, CurrSamplesPerSec=8.19743360161826, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1681, Loss: 0.08663471788167953 +[2024-01-21 19:01:42,057] [INFO] [logging.py:96:log_dist] [Rank 0] step=1690, skipped=0, lr=[1.748301293424432e-05, 1.748301293424432e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:01:42,058] [INFO] [timer.py:260:stop] epoch=0/micro_step=1690/global_step=1690, RunningAvgSamplesPerSec=8.215688520363289, CurrSamplesPerSec=8.218529135154474, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1691, Loss: 0.09708768874406815 +[2024-01-21 19:02:20,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=1700, skipped=0, lr=[1.745267521942197e-05, 1.745267521942197e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:02:20,977] [INFO] [timer.py:260:stop] epoch=0/micro_step=1700/global_step=1700, RunningAvgSamplesPerSec=8.215740484412667, CurrSamplesPerSec=8.203234886940189, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1701, Loss: 0.03757631406188011 +[2024-01-21 19:02:59,933] [INFO] [logging.py:96:log_dist] [Rank 0] step=1710, skipped=0, lr=[1.742218242630308e-05, 1.742218242630308e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:02:59,933] [INFO] [timer.py:260:stop] epoch=0/micro_step=1710/global_step=1710, RunningAvgSamplesPerSec=8.21574653347623, CurrSamplesPerSec=8.205505729553764, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1711, Loss: 0.060404714196920395 +[2024-01-21 19:03:38,900] [INFO] [logging.py:96:log_dist] [Rank 0] step=1720, skipped=0, lr=[1.7391535189394094e-05, 1.7391535189394094e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:03:38,901] [INFO] [timer.py:260:stop] epoch=0/micro_step=1720/global_step=1720, RunningAvgSamplesPerSec=8.215738160371695, CurrSamplesPerSec=8.19207000059815, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1721, Loss: 0.05243632569909096 +[2024-01-21 19:04:17,884] [INFO] [logging.py:96:log_dist] [Rank 0] step=1730, skipped=0, lr=[1.7360734146415182e-05, 1.7360734146415182e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:04:17,885] [INFO] [timer.py:260:stop] epoch=0/micro_step=1730/global_step=1730, RunningAvgSamplesPerSec=8.215709792654176, CurrSamplesPerSec=8.229412080652791, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1731, Loss: 0.11589611321687698 +[2024-01-21 19:04:56,884] [INFO] [logging.py:96:log_dist] [Rank 0] step=1740, skipped=0, lr=[1.7329779938286972e-05, 1.7329779938286972e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:04:56,884] [INFO] [timer.py:260:stop] epoch=0/micro_step=1740/global_step=1740, RunningAvgSamplesPerSec=8.215663717701796, CurrSamplesPerSec=8.189533742707122, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1741, Loss: 0.1844688504934311 +[2024-01-21 19:05:35,788] [INFO] [logging.py:96:log_dist] [Rank 0] step=1750, skipped=0, lr=[1.729867320911721e-05, 1.729867320911721e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:05:35,788] [INFO] [timer.py:260:stop] epoch=0/micro_step=1750/global_step=1750, RunningAvgSamplesPerSec=8.215733112842537, CurrSamplesPerSec=8.217714967570334, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1751, Loss: 0.2081376016139984 +[2024-01-21 19:06:14,749] [INFO] [logging.py:96:log_dist] [Rank 0] step=1760, skipped=0, lr=[1.7267414606187364e-05, 1.7267414606187364e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:06:14,750] [INFO] [timer.py:260:stop] epoch=0/micro_step=1760/global_step=1760, RunningAvgSamplesPerSec=8.21573281731473, CurrSamplesPerSec=8.2011602314597, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1761, Loss: 0.03743457421660423 +[2024-01-21 19:06:53,743] [INFO] [logging.py:96:log_dist] [Rank 0] step=1770, skipped=0, lr=[1.723600477993916e-05, 1.723600477993916e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:06:53,744] [INFO] [timer.py:260:stop] epoch=0/micro_step=1770/global_step=1770, RunningAvgSamplesPerSec=8.215692719040492, CurrSamplesPerSec=8.153644428503807, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1771, Loss: 0.089599609375 +[2024-01-21 19:07:32,667] [INFO] [logging.py:96:log_dist] [Rank 0] step=1780, skipped=0, lr=[1.7204444383961032e-05, 1.7204444383961032e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:07:32,668] [INFO] [timer.py:260:stop] epoch=0/micro_step=1780/global_step=1780, RunningAvgSamplesPerSec=8.215737712993743, CurrSamplesPerSec=8.22363015757078, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1781, Loss: 0.021943964064121246 +[2024-01-21 19:08:11,582] [INFO] [logging.py:96:log_dist] [Rank 0] step=1790, skipped=0, lr=[1.7172734074974534e-05, 1.7172734074974534e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:08:11,582] [INFO] [timer.py:260:stop] epoch=0/micro_step=1790/global_step=1790, RunningAvgSamplesPerSec=8.215792146138137, CurrSamplesPerSec=8.232270494587997, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1791, Loss: 0.03867803141474724 +[2024-01-21 19:08:50,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=1800, skipped=0, lr=[1.7140874512820674e-05, 1.7140874512820674e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:08:50,568] [INFO] [timer.py:260:stop] epoch=0/micro_step=1800/global_step=1800, RunningAvgSamplesPerSec=8.215762825688747, CurrSamplesPerSec=8.227753364780206, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1801, Loss: 0.023557770997285843 +[2024-01-21 19:09:29,557] [INFO] [logging.py:96:log_dist] [Rank 0] step=1810, skipped=0, lr=[1.7108866360446172e-05, 1.7108866360446172e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:09:29,557] [INFO] [timer.py:260:stop] epoch=0/micro_step=1810/global_step=1810, RunningAvgSamplesPerSec=8.215729766801354, CurrSamplesPerSec=8.232321492623294, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1811, Loss: 0.20333200693130493 +[2024-01-21 19:10:08,501] [INFO] [logging.py:96:log_dist] [Rank 0] step=1820, skipped=0, lr=[1.7076710283889678e-05, 1.7076710283889678e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:10:08,502] [INFO] [timer.py:260:stop] epoch=0/micro_step=1820/global_step=1820, RunningAvgSamplesPerSec=8.215749051202073, CurrSamplesPerSec=8.202186651243867, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1821, Loss: 0.10607556253671646 +[2024-01-21 19:10:47,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=1830, skipped=0, lr=[1.704440695226791e-05, 1.704440695226791e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:10:47,424] [INFO] [timer.py:260:stop] epoch=0/micro_step=1830/global_step=1830, RunningAvgSamplesPerSec=8.215794166931538, CurrSamplesPerSec=8.217338130024954, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1831, Loss: 0.1680225431919098 +[2024-01-21 19:11:26,396] [INFO] [logging.py:96:log_dist] [Rank 0] step=1840, skipped=0, lr=[1.701195703776173e-05, 1.701195703776173e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:11:26,397] [INFO] [timer.py:260:stop] epoch=0/micro_step=1840/global_step=1840, RunningAvgSamplesPerSec=8.215780007772834, CurrSamplesPerSec=8.202680907866725, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1841, Loss: 0.11539898067712784 +[2024-01-21 19:12:05,370] [INFO] [logging.py:96:log_dist] [Rank 0] step=1850, skipped=0, lr=[1.6979361215602156e-05, 1.6979361215602156e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:12:05,370] [INFO] [timer.py:260:stop] epoch=0/micro_step=1850/global_step=1850, RunningAvgSamplesPerSec=8.215765678875654, CurrSamplesPerSec=8.212827819574176, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1851, Loss: 0.11162851005792618 +[2024-01-21 19:12:44,326] [INFO] [logging.py:96:log_dist] [Rank 0] step=1860, skipped=0, lr=[1.6946620164056305e-05, 1.6946620164056305e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:12:44,326] [INFO] [timer.py:260:stop] epoch=0/micro_step=1860/global_step=1860, RunningAvgSamplesPerSec=8.215771409606774, CurrSamplesPerSec=8.17434662435245, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1861, Loss: 0.053211405873298645 +[2024-01-21 19:13:23,281] [INFO] [logging.py:96:log_dist] [Rank 0] step=1870, skipped=0, lr=[1.6913734564413296e-05, 1.6913734564413296e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:13:23,281] [INFO] [timer.py:260:stop] epoch=0/micro_step=1870/global_step=1870, RunningAvgSamplesPerSec=8.2157785470567, CurrSamplesPerSec=8.20955855412926, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1871, Loss: 0.13979095220565796 +[2024-01-21 19:14:02,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=1880, skipped=0, lr=[1.6880705100970057e-05, 1.6880705100970057e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:14:02,263] [INFO] [timer.py:260:stop] epoch=0/micro_step=1880/global_step=1880, RunningAvgSamplesPerSec=8.215755477803848, CurrSamplesPerSec=8.188622393180353, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1881, Loss: 0.08738209307193756 +[2024-01-21 19:14:41,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=1890, skipped=0, lr=[1.6847532461017094e-05, 1.6847532461017094e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:14:41,200] [INFO] [timer.py:260:stop] epoch=0/micro_step=1890/global_step=1890, RunningAvgSamplesPerSec=8.21578120462298, CurrSamplesPerSec=8.229979769310166, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1891, Loss: 0.04025859013199806 +[2024-01-21 19:15:20,167] [INFO] [logging.py:96:log_dist] [Rank 0] step=1900, skipped=0, lr=[1.681421733482419e-05, 1.681421733482419e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:15:20,167] [INFO] [timer.py:260:stop] epoch=0/micro_step=1900/global_step=1900, RunningAvgSamplesPerSec=8.2157744523808, CurrSamplesPerSec=8.216304897042239, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1901, Loss: 0.08802089095115662 +[2024-01-21 19:15:59,171] [INFO] [logging.py:96:log_dist] [Rank 0] step=1910, skipped=0, lr=[1.678076041562604e-05, 1.678076041562604e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:15:59,172] [INFO] [timer.py:260:stop] epoch=0/micro_step=1910/global_step=1910, RunningAvgSamplesPerSec=8.21572624031192, CurrSamplesPerSec=8.17057070350949, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1911, Loss: 0.0605689138174057 +[2024-01-21 19:16:38,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=1920, skipped=0, lr=[1.6747162399607817e-05, 1.6747162399607817e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:16:38,133] [INFO] [timer.py:260:stop] epoch=0/micro_step=1920/global_step=1920, RunningAvgSamplesPerSec=8.215726134796304, CurrSamplesPerSec=8.236911377200897, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1921, Loss: 0.26290255784988403 +[2024-01-21 19:17:17,074] [INFO] [logging.py:96:log_dist] [Rank 0] step=1930, skipped=0, lr=[1.671342398589071e-05, 1.671342398589071e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:17:17,074] [INFO] [timer.py:260:stop] epoch=0/micro_step=1930/global_step=1930, RunningAvgSamplesPerSec=8.215747846087243, CurrSamplesPerSec=8.211169251262913, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1931, Loss: 0.1625266820192337 +[2024-01-21 19:17:56,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=1940, skipped=0, lr=[1.667954587651734e-05, 1.667954587651734e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:17:56,013] [INFO] [timer.py:260:stop] epoch=0/micro_step=1940/global_step=1940, RunningAvgSamplesPerSec=8.215771976932134, CurrSamplesPerSec=8.210503198828757, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1941, Loss: 0.10711091756820679 +[2024-01-21 19:18:34,961] [INFO] [logging.py:96:log_dist] [Rank 0] step=1950, skipped=0, lr=[1.664552877643719e-05, 1.664552877643719e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:18:34,961] [INFO] [timer.py:260:stop] epoch=0/micro_step=1950/global_step=1950, RunningAvgSamplesPerSec=8.215785865508112, CurrSamplesPerSec=8.222762083688819, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1951, Loss: 0.0866369754076004 +[2024-01-21 19:19:13,914] [INFO] [logging.py:96:log_dist] [Rank 0] step=1960, skipped=0, lr=[1.6611373393491915e-05, 1.6611373393491915e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:19:13,914] [INFO] [timer.py:260:stop] epoch=0/micro_step=1960/global_step=1960, RunningAvgSamplesPerSec=8.215794299099892, CurrSamplesPerSec=8.206674238800042, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1961, Loss: 0.06272883713245392 +[2024-01-21 19:19:52,850] [INFO] [logging.py:96:log_dist] [Rank 0] step=1970, skipped=0, lr=[1.6577080438400604e-05, 1.6577080438400604e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:19:52,850] [INFO] [timer.py:260:stop] epoch=0/micro_step=1970/global_step=1970, RunningAvgSamplesPerSec=8.215820355444967, CurrSamplesPerSec=8.22371934327263, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1971, Loss: 0.21708062291145325 +[2024-01-21 19:20:31,789] [INFO] [logging.py:96:log_dist] [Rank 0] step=1980, skipped=0, lr=[1.6542650624745013e-05, 1.6542650624745013e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:20:31,790] [INFO] [timer.py:260:stop] epoch=0/micro_step=1980/global_step=1980, RunningAvgSamplesPerSec=8.215843046589152, CurrSamplesPerSec=8.205354735680032, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1981, Loss: 0.025712214410305023 +[2024-01-21 19:21:10,761] [INFO] [logging.py:96:log_dist] [Rank 0] step=1990, skipped=0, lr=[1.650808466895471e-05, 1.650808466895471e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:21:10,762] [INFO] [timer.py:260:stop] epoch=0/micro_step=1990/global_step=1990, RunningAvgSamplesPerSec=8.215830193179888, CurrSamplesPerSec=8.19203850018094, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 1991, Loss: 0.059649981558322906 +[2024-01-21 19:21:49,719] [INFO] [logging.py:96:log_dist] [Rank 0] step=2000, skipped=0, lr=[1.6473383290292158e-05, 1.6473383290292158e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:21:49,719] [INFO] [timer.py:260:stop] epoch=0/micro_step=2000/global_step=2000, RunningAvgSamplesPerSec=8.215833378432965, CurrSamplesPerSec=8.217005595825333, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2001, Loss: 0.059304554015398026 +[2024-01-21 19:22:28,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=2010, skipped=0, lr=[1.6438547210837753e-05, 1.6438547210837753e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:22:28,659] [INFO] [timer.py:260:stop] epoch=0/micro_step=2010/global_step=2010, RunningAvgSamplesPerSec=8.215854793190239, CurrSamplesPerSec=8.230482933079314, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2011, Loss: 0.07000162452459335 +[2024-01-21 19:23:07,605] [INFO] [logging.py:96:log_dist] [Rank 0] step=2020, skipped=0, lr=[1.64035771554748e-05, 1.64035771554748e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:23:07,605] [INFO] [timer.py:260:stop] epoch=0/micro_step=2020/global_step=2020, RunningAvgSamplesPerSec=8.215869344736806, CurrSamplesPerSec=8.234770141599705, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2021, Loss: 0.1127006933093071 +[2024-01-21 19:23:46,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=2030, skipped=0, lr=[1.6368473851874432e-05, 1.6368473851874432e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:23:46,616] [INFO] [timer.py:260:stop] epoch=0/micro_step=2030/global_step=2030, RunningAvgSamplesPerSec=8.215816753701143, CurrSamplesPerSec=8.177493733040949, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2031, Loss: 0.03800923377275467 +[2024-01-21 19:24:25,620] [INFO] [logging.py:96:log_dist] [Rank 0] step=2040, skipped=0, lr=[1.6333238030480473e-05, 1.6333238030480473e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:24:25,620] [INFO] [timer.py:260:stop] epoch=0/micro_step=2040/global_step=2040, RunningAvgSamplesPerSec=8.215771328496436, CurrSamplesPerSec=8.197679434769535, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2041, Loss: 0.18091914057731628 +[2024-01-21 19:25:04,587] [INFO] [logging.py:96:log_dist] [Rank 0] step=2050, skipped=0, lr=[1.629787042449421e-05, 1.629787042449421e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:25:04,587] [INFO] [timer.py:260:stop] epoch=0/micro_step=2050/global_step=2050, RunningAvgSamplesPerSec=8.21576548983545, CurrSamplesPerSec=8.222217552826436, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2051, Loss: 0.05575120076537132 +[2024-01-21 19:25:43,539] [INFO] [logging.py:96:log_dist] [Rank 0] step=2060, skipped=0, lr=[1.6262371769859182e-05, 1.6262371769859182e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:25:43,540] [INFO] [timer.py:260:stop] epoch=0/micro_step=2060/global_step=2060, RunningAvgSamplesPerSec=8.2157741345655, CurrSamplesPerSec=8.230918519702321, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2061, Loss: 0.04879733547568321 +[2024-01-21 19:26:22,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=2070, skipped=0, lr=[1.6226742805245824e-05, 1.6226742805245824e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:26:22,510] [INFO] [timer.py:260:stop] epoch=0/micro_step=2070/global_step=2070, RunningAvgSamplesPerSec=8.215764384133283, CurrSamplesPerSec=8.205810744095974, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2071, Loss: 0.11875536292791367 +[2024-01-21 19:27:01,469] [INFO] [logging.py:96:log_dist] [Rank 0] step=2080, skipped=0, lr=[1.6190984272036118e-05, 1.6190984272036118e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:27:01,469] [INFO] [timer.py:260:stop] epoch=0/micro_step=2080/global_step=2080, RunningAvgSamplesPerSec=8.215765686193961, CurrSamplesPerSec=8.213961216497111, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2081, Loss: 0.15719068050384521 +[2024-01-21 19:27:40,485] [INFO] [logging.py:96:log_dist] [Rank 0] step=2090, skipped=0, lr=[1.615509691430817e-05, 1.615509691430817e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:27:40,486] [INFO] [timer.py:260:stop] epoch=0/micro_step=2090/global_step=2090, RunningAvgSamplesPerSec=8.215709250205897, CurrSamplesPerSec=8.197446118224907, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2091, Loss: 0.16572962701320648 +[2024-01-21 19:28:19,435] [INFO] [logging.py:96:log_dist] [Rank 0] step=2100, skipped=0, lr=[1.6119081478820706e-05, 1.6119081478820706e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:28:19,435] [INFO] [timer.py:260:stop] epoch=0/micro_step=2100/global_step=2100, RunningAvgSamplesPerSec=8.21572053701086, CurrSamplesPerSec=8.243910876299196, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2101, Loss: 0.2102108895778656 +[2024-01-21 19:28:58,340] [INFO] [logging.py:96:log_dist] [Rank 0] step=2110, skipped=0, lr=[1.608293871499756e-05, 1.608293871499756e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:28:58,341] [INFO] [timer.py:260:stop] epoch=0/micro_step=2110/global_step=2110, RunningAvgSamplesPerSec=8.215775869236719, CurrSamplesPerSec=8.215595768706804, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2111, Loss: 0.0784444585442543 +[2024-01-21 19:29:37,307] [INFO] [logging.py:96:log_dist] [Rank 0] step=2120, skipped=0, lr=[1.604666937491205e-05, 1.604666937491205e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:29:37,308] [INFO] [timer.py:260:stop] epoch=0/micro_step=2120/global_step=2120, RunningAvgSamplesPerSec=8.21576979369537, CurrSamplesPerSec=8.235326947768785, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2121, Loss: 0.0855594128370285 +[2024-01-21 19:30:16,244] [INFO] [logging.py:96:log_dist] [Rank 0] step=2130, skipped=0, lr=[1.6010274213271363e-05, 1.6010274213271363e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:30:16,244] [INFO] [timer.py:260:stop] epoch=0/micro_step=2130/global_step=2130, RunningAvgSamplesPerSec=8.215794378615898, CurrSamplesPerSec=8.220975123871035, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2131, Loss: 0.06934328377246857 +[2024-01-21 19:30:55,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=2140, skipped=0, lr=[1.5973753987400815e-05, 1.5973753987400815e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:30:55,185] [INFO] [timer.py:260:stop] epoch=0/micro_step=2140/global_step=2140, RunningAvgSamplesPerSec=8.215813532871877, CurrSamplesPerSec=8.211605810308082, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2141, Loss: 0.2370654195547104 +[2024-01-21 19:31:34,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=2150, skipped=0, lr=[1.5937109457228122e-05, 1.5937109457228122e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:31:34,192] [INFO] [timer.py:260:stop] epoch=0/micro_step=2150/global_step=2150, RunningAvgSamplesPerSec=8.21576827013737, CurrSamplesPerSec=8.226962582439581, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2151, Loss: 0.02988426946103573 +[2024-01-21 19:32:13,172] [INFO] [logging.py:96:log_dist] [Rank 0] step=2160, skipped=0, lr=[1.5900341385267566e-05, 1.5900341385267566e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:32:13,173] [INFO] [timer.py:260:stop] epoch=0/micro_step=2160/global_step=2160, RunningAvgSamplesPerSec=8.21574835597002, CurrSamplesPerSec=8.238671895261149, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2161, Loss: 0.08292420953512192 +[2024-01-21 19:32:52,094] [INFO] [logging.py:96:log_dist] [Rank 0] step=2170, skipped=0, lr=[1.586345053660414e-05, 1.586345053660414e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:32:52,095] [INFO] [timer.py:260:stop] epoch=0/micro_step=2170/global_step=2170, RunningAvgSamplesPerSec=8.215785874113333, CurrSamplesPerSec=8.246186059028012, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2171, Loss: 0.09025851637125015 +[2024-01-21 19:33:31,009] [INFO] [logging.py:96:log_dist] [Rank 0] step=2180, skipped=0, lr=[1.582643767887762e-05, 1.582643767887762e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:33:31,010] [INFO] [timer.py:260:stop] epoch=0/micro_step=2180/global_step=2180, RunningAvgSamplesPerSec=8.215829714555293, CurrSamplesPerSec=8.231762063581042, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2181, Loss: 0.03064866177737713 +[2024-01-21 19:34:09,982] [INFO] [logging.py:96:log_dist] [Rank 0] step=2190, skipped=0, lr=[1.5789303582266612e-05, 1.5789303582266612e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:34:09,982] [INFO] [timer.py:260:stop] epoch=0/micro_step=2190/global_step=2190, RunningAvgSamplesPerSec=8.215817951060082, CurrSamplesPerSec=8.192250007629628, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2191, Loss: 0.042166680097579956 +[2024-01-21 19:34:48,965] [INFO] [logging.py:96:log_dist] [Rank 0] step=2200, skipped=0, lr=[1.5752049019472486e-05, 1.5752049019472486e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:34:48,966] [INFO] [timer.py:260:stop] epoch=0/micro_step=2200/global_step=2200, RunningAvgSamplesPerSec=8.215795638581056, CurrSamplesPerSec=8.214984306678353, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2201, Loss: 0.15927480161190033 +[2024-01-21 19:35:27,926] [INFO] [logging.py:96:log_dist] [Rank 0] step=2210, skipped=0, lr=[1.571467476570334e-05, 1.571467476570334e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:35:27,926] [INFO] [timer.py:260:stop] epoch=0/micro_step=2210/global_step=2210, RunningAvgSamplesPerSec=8.215796176481364, CurrSamplesPerSec=8.216806390043379, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2211, Loss: 0.12761157751083374 +[2024-01-21 19:36:06,909] [INFO] [logging.py:96:log_dist] [Rank 0] step=2220, skipped=0, lr=[1.5677181598657843e-05, 1.5677181598657843e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:36:06,910] [INFO] [timer.py:260:stop] epoch=0/micro_step=2220/global_step=2220, RunningAvgSamplesPerSec=8.215773879608143, CurrSamplesPerSec=8.201087569874943, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2221, Loss: 0.053722627460956573 +[2024-01-21 19:36:45,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=2230, skipped=0, lr=[1.5639570298509067e-05, 1.5639570298509067e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:36:45,860] [INFO] [timer.py:260:stop] epoch=0/micro_step=2230/global_step=2230, RunningAvgSamplesPerSec=8.215783993565331, CurrSamplesPerSec=8.24837176093673, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2231, Loss: 0.1492067575454712 +[2024-01-21 19:37:24,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=2240, skipped=0, lr=[1.5601841647888233e-05, 1.5601841647888233e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:37:24,860] [INFO] [timer.py:260:stop] epoch=0/micro_step=2240/global_step=2240, RunningAvgSamplesPerSec=8.215747204219577, CurrSamplesPerSec=8.17725359285556, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2241, Loss: 0.07994047552347183 +[2024-01-21 19:38:03,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=2250, skipped=0, lr=[1.5563996431868443e-05, 1.5563996431868443e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:38:03,849] [INFO] [timer.py:260:stop] epoch=0/micro_step=2250/global_step=2250, RunningAvgSamplesPerSec=8.215720382823587, CurrSamplesPerSec=8.202771645016023, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2251, Loss: 0.08422773331403732 +[2024-01-21 19:38:42,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=2260, skipped=0, lr=[1.552603543794835e-05, 1.552603543794835e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:38:42,801] [INFO] [timer.py:260:stop] epoch=0/micro_step=2260/global_step=2260, RunningAvgSamplesPerSec=8.215728044824793, CurrSamplesPerSec=8.230574791080167, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2261, Loss: 0.1763874739408493 +[2024-01-21 19:39:21,799] [INFO] [logging.py:96:log_dist] [Rank 0] step=2270, skipped=0, lr=[1.5487959456035745e-05, 1.5487959456035745e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:39:21,800] [INFO] [timer.py:260:stop] epoch=0/micro_step=2270/global_step=2270, RunningAvgSamplesPerSec=8.21569263010244, CurrSamplesPerSec=8.171652664666276, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2271, Loss: 0.21936160326004028 +[2024-01-21 19:40:00,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=2280, skipped=0, lr=[1.5449769278431145e-05, 1.5449769278431145e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:40:00,729] [INFO] [timer.py:260:stop] epoch=0/micro_step=2280/global_step=2280, RunningAvgSamplesPerSec=8.21572189729655, CurrSamplesPerSec=8.2367036230204, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2281, Loss: 0.13190844655036926 +[2024-01-21 19:40:39,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=2290, skipped=0, lr=[1.5411465699811293e-05, 1.5411465699811293e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:40:39,651] [INFO] [timer.py:260:stop] epoch=0/micro_step=2290/global_step=2290, RunningAvgSamplesPerSec=8.215757262421743, CurrSamplesPerSec=8.208379183388827, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2291, Loss: 0.12495028972625732 +[2024-01-21 19:41:18,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=2300, skipped=0, lr=[1.5373049517212633e-05, 1.5373049517212633e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:41:18,608] [INFO] [timer.py:260:stop] epoch=0/micro_step=2300/global_step=2300, RunningAvgSamplesPerSec=8.215760884672264, CurrSamplesPerSec=8.188179782332998, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2301, Loss: 0.04025004431605339 +[2024-01-21 19:41:57,592] [INFO] [logging.py:96:log_dist] [Rank 0] step=2310, skipped=0, lr=[1.5334521530014713e-05, 1.5334521530014713e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:41:57,593] [INFO] [timer.py:260:stop] epoch=0/micro_step=2310/global_step=2310, RunningAvgSamplesPerSec=8.215738398637345, CurrSamplesPerSec=8.220004910528614, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2311, Loss: 0.07860980927944183 +[2024-01-21 19:42:36,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=2320, skipped=0, lr=[1.529588253992356e-05, 1.529588253992356e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:42:36,591] [INFO] [timer.py:260:stop] epoch=0/micro_step=2320/global_step=2320, RunningAvgSamplesPerSec=8.215704164524267, CurrSamplesPerSec=8.207532394373777, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 0, Total Step: 2321, Loss: 0.03918910026550293 +***** Evaluating perplexity, Epoch 1/3 ***** +ppl: 1.0075013637542725 +eval loss: 0.007473226636648178 +Beginning of Epoch 2/3, Total Micro Batches 2329 +Epoch: 1, Total Step: 2330, Loss: 0.061899974942207336 +[2024-01-21 19:43:13,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=2330, skipped=0, lr=[1.5257133350954987e-05, 1.5257133350954987e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:43:13,263] [INFO] [timer.py:260:stop] epoch=1/micro_step=1/global_step=2330, RunningAvgSamplesPerSec=8.218150912851243, CurrSamplesPerSec=8.25073258610749, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2340, Loss: 0.025106582790613174 +[2024-01-21 19:43:52,158] [INFO] [logging.py:96:log_dist] [Rank 0] step=2340, skipped=0, lr=[1.5218274769417875e-05, 1.5218274769417875e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:43:52,158] [INFO] [timer.py:260:stop] epoch=1/micro_step=11/global_step=2340, RunningAvgSamplesPerSec=8.218198929745254, CurrSamplesPerSec=8.246114623916558, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2350, Loss: 0.1114315390586853 +[2024-01-21 19:44:31,038] [INFO] [logging.py:96:log_dist] [Rank 0] step=2350, skipped=0, lr=[1.5179307603897394e-05, 1.5179307603897394e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:44:31,039] [INFO] [timer.py:260:stop] epoch=1/micro_step=21/global_step=2350, RunningAvgSamplesPerSec=8.218260451044545, CurrSamplesPerSec=8.204169551550985, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2360, Loss: 0.04444192722439766 +[2024-01-21 19:45:09,967] [INFO] [logging.py:96:log_dist] [Rank 0] step=2360, skipped=0, lr=[1.5140232665238171e-05, 1.5140232665238171e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:45:09,968] [INFO] [timer.py:260:stop] epoch=1/micro_step=31/global_step=2360, RunningAvgSamplesPerSec=8.218277517825147, CurrSamplesPerSec=8.220268713386128, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2370, Loss: 0.1439174860715866 +[2024-01-21 19:45:48,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=2370, skipped=0, lr=[1.5101050766527414e-05, 1.5101050766527414e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:45:48,920] [INFO] [timer.py:260:stop] epoch=1/micro_step=41/global_step=2370, RunningAvgSamplesPerSec=8.218273596265398, CurrSamplesPerSec=8.172524909056134, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2380, Loss: 0.1552136242389679 +[2024-01-21 19:46:27,901] [INFO] [logging.py:96:log_dist] [Rank 0] step=2380, skipped=0, lr=[1.5061762723078007e-05, 1.5061762723078007e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:46:27,902] [INFO] [timer.py:260:stop] epoch=1/micro_step=51/global_step=2380, RunningAvgSamplesPerSec=8.218243955300322, CurrSamplesPerSec=8.190024976276778, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2390, Loss: 0.13993076980113983 +[2024-01-21 19:47:06,904] [INFO] [logging.py:96:log_dist] [Rank 0] step=2390, skipped=0, lr=[1.5022369352411535e-05, 1.5022369352411535e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:47:06,905] [INFO] [timer.py:260:stop] epoch=1/micro_step=61/global_step=2390, RunningAvgSamplesPerSec=8.218195708871802, CurrSamplesPerSec=8.231162327008972, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2400, Loss: 0.20101605355739594 +[2024-01-21 19:47:45,872] [INFO] [logging.py:96:log_dist] [Rank 0] step=2400, skipped=0, lr=[1.498287147424127e-05, 1.498287147424127e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:47:45,872] [INFO] [timer.py:260:stop] epoch=1/micro_step=71/global_step=2400, RunningAvgSamplesPerSec=8.218179523695687, CurrSamplesPerSec=8.214624813519903, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2410, Loss: 0.02727091684937477 +[2024-01-21 19:48:24,873] [INFO] [logging.py:96:log_dist] [Rank 0] step=2410, skipped=0, lr=[1.4943269910455127e-05, 1.4943269910455127e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:48:24,874] [INFO] [timer.py:260:stop] epoch=1/micro_step=81/global_step=2410, RunningAvgSamplesPerSec=8.218132933047974, CurrSamplesPerSec=8.217488559025949, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2420, Loss: 0.057129353284835815 +[2024-01-21 19:49:03,793] [INFO] [logging.py:96:log_dist] [Rank 0] step=2420, skipped=0, lr=[1.4903565485098547e-05, 1.4903565485098547e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:49:03,793] [INFO] [timer.py:260:stop] epoch=1/micro_step=91/global_step=2420, RunningAvgSamplesPerSec=8.218158827357525, CurrSamplesPerSec=8.228041372385988, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2430, Loss: 0.03352535143494606 +[2024-01-21 19:49:42,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=2430, skipped=0, lr=[1.4863759024357358e-05, 1.4863759024357358e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:49:42,729] [INFO] [timer.py:260:stop] epoch=1/micro_step=101/global_step=2430, RunningAvgSamplesPerSec=8.218169794681193, CurrSamplesPerSec=8.231244104122892, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2440, Loss: 0.06345923244953156 +[2024-01-21 19:50:21,642] [INFO] [logging.py:96:log_dist] [Rank 0] step=2440, skipped=0, lr=[1.4823851356540584e-05, 1.4823851356540584e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:50:21,643] [INFO] [timer.py:260:stop] epoch=1/micro_step=111/global_step=2440, RunningAvgSamplesPerSec=8.218200576415109, CurrSamplesPerSec=8.22829408918685, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2450, Loss: 0.09876459091901779 +[2024-01-21 19:51:00,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=2450, skipped=0, lr=[1.4783843312063204e-05, 1.4783843312063204e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:51:00,608] [INFO] [timer.py:260:stop] epoch=1/micro_step=121/global_step=2450, RunningAvgSamplesPerSec=8.218186754805359, CurrSamplesPerSec=8.214380476690613, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2460, Loss: 0.024412862956523895 +[2024-01-21 19:51:39,523] [INFO] [logging.py:96:log_dist] [Rank 0] step=2460, skipped=0, lr=[1.4743735723428873e-05, 1.4743735723428873e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:51:39,524] [INFO] [timer.py:260:stop] epoch=1/micro_step=131/global_step=2460, RunningAvgSamplesPerSec=8.218214829945586, CurrSamplesPerSec=8.233274918745488, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2470, Loss: 0.0556696355342865 +[2024-01-21 19:52:18,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=2470, skipped=0, lr=[1.470352942521261e-05, 1.470352942521261e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:52:18,493] [INFO] [timer.py:260:stop] epoch=1/micro_step=141/global_step=2470, RunningAvgSamplesPerSec=8.218196655027631, CurrSamplesPerSec=8.193331716452693, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2480, Loss: 0.1555713266134262 +[2024-01-21 19:52:57,492] [INFO] [logging.py:96:log_dist] [Rank 0] step=2480, skipped=0, lr=[1.4663225254043416e-05, 1.4663225254043416e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:52:57,493] [INFO] [timer.py:260:stop] epoch=1/micro_step=151/global_step=2480, RunningAvgSamplesPerSec=8.218153250044852, CurrSamplesPerSec=8.227347868111089, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2490, Loss: 0.17374421656131744 +[2024-01-21 19:53:36,434] [INFO] [logging.py:96:log_dist] [Rank 0] step=2490, skipped=0, lr=[1.462282404858687e-05, 1.462282404858687e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:53:36,435] [INFO] [timer.py:260:stop] epoch=1/micro_step=161/global_step=2490, RunningAvgSamplesPerSec=8.218159506363282, CurrSamplesPerSec=8.22251524762675, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2500, Loss: 0.0419662706553936 +[2024-01-21 19:54:15,427] [INFO] [logging.py:96:log_dist] [Rank 0] step=2500, skipped=0, lr=[1.4582326649527692e-05, 1.4582326649527692e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:54:15,427] [INFO] [timer.py:260:stop] epoch=1/micro_step=171/global_step=2500, RunningAvgSamplesPerSec=8.218122641153904, CurrSamplesPerSec=8.203561293351042, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2510, Loss: 0.03231734409928322 +[2024-01-21 19:54:54,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=2510, skipped=0, lr=[1.4541733899552221e-05, 1.4541733899552221e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:54:54,361] [INFO] [timer.py:260:stop] epoch=1/micro_step=181/global_step=2510, RunningAvgSamplesPerSec=8.218135524959578, CurrSamplesPerSec=8.236092552861889, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2520, Loss: 0.11430536955595016 +[2024-01-21 19:55:33,260] [INFO] [logging.py:96:log_dist] [Rank 0] step=2520, skipped=0, lr=[1.4501046643330913e-05, 1.4501046643330913e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:55:33,260] [INFO] [timer.py:260:stop] epoch=1/micro_step=191/global_step=2520, RunningAvgSamplesPerSec=8.218176848913172, CurrSamplesPerSec=8.239326340489326, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2530, Loss: 0.11850032210350037 +[2024-01-21 19:56:12,206] [INFO] [logging.py:96:log_dist] [Rank 0] step=2530, skipped=0, lr=[1.4460265727500736e-05, 1.4460265727500736e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:56:12,207] [INFO] [timer.py:260:stop] epoch=1/micro_step=201/global_step=2530, RunningAvgSamplesPerSec=8.218178683089752, CurrSamplesPerSec=8.22213494759939, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2540, Loss: 0.1906827986240387 +[2024-01-21 19:56:51,085] [INFO] [logging.py:96:log_dist] [Rank 0] step=2540, skipped=0, lr=[1.441939200064757e-05, 1.441939200064757e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:56:51,086] [INFO] [timer.py:260:stop] epoch=1/micro_step=211/global_step=2540, RunningAvgSamplesPerSec=8.218236874199418, CurrSamplesPerSec=8.241827744358027, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2550, Loss: 0.10113878548145294 +[2024-01-21 19:57:30,024] [INFO] [logging.py:96:log_dist] [Rank 0] step=2550, skipped=0, lr=[1.4378426313288546e-05, 1.4378426313288546e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:57:30,025] [INFO] [timer.py:260:stop] epoch=1/micro_step=221/global_step=2550, RunningAvgSamplesPerSec=8.21824459606778, CurrSamplesPerSec=8.215113530787887, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2560, Loss: 0.06021902337670326 +[2024-01-21 19:58:08,970] [INFO] [logging.py:96:log_dist] [Rank 0] step=2560, skipped=0, lr=[1.4337369517854344e-05, 1.4337369517854344e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:58:08,970] [INFO] [timer.py:260:stop] epoch=1/micro_step=231/global_step=2560, RunningAvgSamplesPerSec=8.218246700633461, CurrSamplesPerSec=8.22639278889488, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2570, Loss: 0.1318262219429016 +[2024-01-21 19:58:47,968] [INFO] [logging.py:96:log_dist] [Rank 0] step=2570, skipped=0, lr=[1.4296222468671458e-05, 1.4296222468671458e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:58:47,969] [INFO] [timer.py:260:stop] epoch=1/micro_step=241/global_step=2570, RunningAvgSamplesPerSec=8.218205310028917, CurrSamplesPerSec=8.183309728639612, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2580, Loss: 0.027352459728717804 +[2024-01-21 19:59:26,938] [INFO] [logging.py:96:log_dist] [Rank 0] step=2580, skipped=0, lr=[1.425498602194442e-05, 1.425498602194442e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 19:59:26,938] [INFO] [timer.py:260:stop] epoch=1/micro_step=251/global_step=2580, RunningAvgSamplesPerSec=8.218188452341165, CurrSamplesPerSec=8.214858103398182, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2590, Loss: 0.2660071849822998 +[2024-01-21 20:00:05,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=2590, skipped=0, lr=[1.4213661035737984e-05, 1.4213661035737984e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:00:05,860] [INFO] [timer.py:260:stop] epoch=1/micro_step=261/global_step=2590, RunningAvgSamplesPerSec=8.21821060028981, CurrSamplesPerSec=8.23530976750533, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2600, Loss: 0.23992766439914703 +[2024-01-21 20:00:44,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=2600, skipped=0, lr=[1.4172248369959266e-05, 1.4172248369959266e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:00:44,821] [INFO] [timer.py:260:stop] epoch=1/micro_step=271/global_step=2600, RunningAvgSamplesPerSec=8.218200505195636, CurrSamplesPerSec=8.247934325126023, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2610, Loss: 0.2195226103067398 +[2024-01-21 20:01:23,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=2610, skipped=0, lr=[1.4130748886339851e-05, 1.4130748886339851e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:01:23,729] [INFO] [timer.py:260:stop] epoch=1/micro_step=281/global_step=2610, RunningAvgSamplesPerSec=8.218233579665492, CurrSamplesPerSec=8.20477489059198, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2620, Loss: 0.13749432563781738 +[2024-01-21 20:02:02,661] [INFO] [logging.py:96:log_dist] [Rank 0] step=2620, skipped=0, lr=[1.408916344841788e-05, 1.408916344841788e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:02:02,662] [INFO] [timer.py:260:stop] epoch=1/micro_step=291/global_step=2620, RunningAvgSamplesPerSec=8.218246547411786, CurrSamplesPerSec=8.199845506479466, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2630, Loss: 0.09911809116601944 +[2024-01-21 20:02:41,556] [INFO] [logging.py:96:log_dist] [Rank 0] step=2630, skipped=0, lr=[1.4047492921520046e-05, 1.4047492921520046e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:02:41,557] [INFO] [timer.py:260:stop] epoch=1/micro_step=301/global_step=2630, RunningAvgSamplesPerSec=8.218289346353751, CurrSamplesPerSec=8.245227614399402, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2640, Loss: 0.06678574532270432 +[2024-01-21 20:03:20,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=2640, skipped=0, lr=[1.400573817274362e-05, 1.400573817274362e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:03:20,494] [INFO] [timer.py:260:stop] epoch=1/micro_step=311/global_step=2640, RunningAvgSamplesPerSec=8.218298561023596, CurrSamplesPerSec=8.227240448451653, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2650, Loss: 0.04844808205962181 +[2024-01-21 20:03:59,456] [INFO] [logging.py:96:log_dist] [Rank 0] step=2650, skipped=0, lr=[1.3963900070938398e-05, 1.3963900070938398e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:03:59,457] [INFO] [timer.py:260:stop] epoch=1/micro_step=321/global_step=2650, RunningAvgSamplesPerSec=8.21828652451738, CurrSamplesPerSec=8.210685523708312, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2660, Loss: 0.12183418869972229 +[2024-01-21 20:04:38,406] [INFO] [logging.py:96:log_dist] [Rank 0] step=2660, skipped=0, lr=[1.3921979486688613e-05, 1.3921979486688613e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:04:38,406] [INFO] [timer.py:260:stop] epoch=1/micro_step=331/global_step=2660, RunningAvgSamplesPerSec=8.218286074139034, CurrSamplesPerSec=8.20811664504829, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2670, Loss: 0.17465780675411224 +[2024-01-21 20:05:17,296] [INFO] [logging.py:96:log_dist] [Rank 0] step=2670, skipped=0, lr=[1.3879977292294825e-05, 1.3879977292294825e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:05:17,297] [INFO] [timer.py:260:stop] epoch=1/micro_step=341/global_step=2670, RunningAvgSamplesPerSec=8.218331637699727, CurrSamplesPerSec=8.250417122751406, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2680, Loss: 0.03797774389386177 +[2024-01-21 20:05:56,294] [INFO] [logging.py:96:log_dist] [Rank 0] step=2680, skipped=0, lr=[1.3837894361755782e-05, 1.3837894361755782e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:05:56,295] [INFO] [timer.py:260:stop] epoch=1/micro_step=351/global_step=2680, RunningAvgSamplesPerSec=8.218292358691668, CurrSamplesPerSec=8.188531469214091, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2690, Loss: 0.019382795318961143 +[2024-01-21 20:06:35,207] [INFO] [logging.py:96:log_dist] [Rank 0] step=2690, skipped=0, lr=[1.3795731570750208e-05, 1.3795731570750208e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:06:35,207] [INFO] [timer.py:260:stop] epoch=1/micro_step=361/global_step=2690, RunningAvgSamplesPerSec=8.218320205978406, CurrSamplesPerSec=8.223152014112054, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2700, Loss: 0.155758336186409 +[2024-01-21 20:07:14,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=2700, skipped=0, lr=[1.3753489796618608e-05, 1.3753489796618608e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:07:14,060] [INFO] [timer.py:260:stop] epoch=1/micro_step=371/global_step=2700, RunningAvgSamplesPerSec=8.21839485951636, CurrSamplesPerSec=8.243838974253014, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2710, Loss: 0.03352022543549538 +[2024-01-21 20:07:53,008] [INFO] [logging.py:96:log_dist] [Rank 0] step=2710, skipped=0, lr=[1.3711169918344995e-05, 1.3711169918344995e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:07:53,008] [INFO] [timer.py:260:stop] epoch=1/micro_step=381/global_step=2710, RunningAvgSamplesPerSec=8.21839474323275, CurrSamplesPerSec=8.228214892830621, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2720, Loss: 0.1418105661869049 +[2024-01-21 20:08:31,992] [INFO] [logging.py:96:log_dist] [Rank 0] step=2720, skipped=0, lr=[1.3668772816538604e-05, 1.3668772816538604e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:08:31,993] [INFO] [timer.py:260:stop] epoch=1/micro_step=391/global_step=2720, RunningAvgSamplesPerSec=8.218365871563387, CurrSamplesPerSec=8.243963031385142, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2730, Loss: 0.06328684091567993 +[2024-01-21 20:09:10,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=2730, skipped=0, lr=[1.362629937341557e-05, 1.362629937341557e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:09:10,907] [INFO] [timer.py:260:stop] epoch=1/micro_step=401/global_step=2730, RunningAvgSamplesPerSec=8.218392033059114, CurrSamplesPerSec=8.251274813414874, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2740, Loss: 0.04991483315825462 +[2024-01-21 20:09:49,851] [INFO] [logging.py:96:log_dist] [Rank 0] step=2740, skipped=0, lr=[1.3583750472780567e-05, 1.3583750472780567e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:09:49,852] [INFO] [timer.py:260:stop] epoch=1/micro_step=411/global_step=2740, RunningAvgSamplesPerSec=8.2183941304044, CurrSamplesPerSec=8.233555231668818, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2750, Loss: 0.12184983491897583 +[2024-01-21 20:10:28,814] [INFO] [logging.py:96:log_dist] [Rank 0] step=2750, skipped=0, lr=[1.3541127000008427e-05, 1.3541127000008427e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:10:28,814] [INFO] [timer.py:260:stop] epoch=1/micro_step=421/global_step=2750, RunningAvgSamplesPerSec=8.21838277388314, CurrSamplesPerSec=8.222322322819034, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2760, Loss: 0.10980482399463654 +[2024-01-21 20:11:07,768] [INFO] [logging.py:96:log_dist] [Rank 0] step=2760, skipped=0, lr=[1.34984298420257e-05, 1.34984298420257e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:11:07,768] [INFO] [timer.py:260:stop] epoch=1/micro_step=431/global_step=2760, RunningAvgSamplesPerSec=8.218378028895895, CurrSamplesPerSec=8.182099978639126, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2770, Loss: 0.05907022953033447 +[2024-01-21 20:11:46,643] [INFO] [logging.py:96:log_dist] [Rank 0] step=2770, skipped=0, lr=[1.3455659887292212e-05, 1.3455659887292212e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:11:46,643] [INFO] [timer.py:260:stop] epoch=1/micro_step=441/global_step=2770, RunningAvgSamplesPerSec=8.218433420160725, CurrSamplesPerSec=8.19713421577895, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2780, Loss: 0.09509871900081635 +[2024-01-21 20:12:25,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=2780, skipped=0, lr=[1.3412818025782574e-05, 1.3412818025782574e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:12:25,679] [INFO] [timer.py:260:stop] epoch=1/micro_step=451/global_step=2780, RunningAvgSamplesPerSec=8.218366034728778, CurrSamplesPerSec=8.205795191819059, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2790, Loss: 0.04740368202328682 +[2024-01-21 20:13:04,639] [INFO] [logging.py:96:log_dist] [Rank 0] step=2790, skipped=0, lr=[1.3369905148967658e-05, 1.3369905148967658e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:13:04,639] [INFO] [timer.py:260:stop] epoch=1/micro_step=461/global_step=2790, RunningAvgSamplesPerSec=8.218357304862492, CurrSamplesPerSec=8.210901009006022, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2800, Loss: 0.031013239175081253 +[2024-01-21 20:13:43,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=2800, skipped=0, lr=[1.3326922149796064e-05, 1.3326922149796064e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:13:43,599] [INFO] [timer.py:260:stop] epoch=1/micro_step=471/global_step=2800, RunningAvgSamplesPerSec=8.218348123582613, CurrSamplesPerSec=8.227111346844813, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2810, Loss: 0.221916064620018 +[2024-01-21 20:14:22,569] [INFO] [logging.py:96:log_dist] [Rank 0] step=2810, skipped=0, lr=[1.3283869922675507e-05, 1.3283869922675507e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:14:22,569] [INFO] [timer.py:260:stop] epoch=1/micro_step=481/global_step=2810, RunningAvgSamplesPerSec=8.218331358881633, CurrSamplesPerSec=8.176743964462302, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2820, Loss: 0.047555066645145416 +[2024-01-21 20:15:01,568] [INFO] [logging.py:96:log_dist] [Rank 0] step=2820, skipped=0, lr=[1.3240749363454242e-05, 1.3240749363454242e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:15:01,569] [INFO] [timer.py:260:stop] epoch=1/micro_step=491/global_step=2820, RunningAvgSamplesPerSec=8.21829305993986, CurrSamplesPerSec=8.203133610968452, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2830, Loss: 0.16547687351703644 +[2024-01-21 20:15:40,480] [INFO] [logging.py:96:log_dist] [Rank 0] step=2830, skipped=0, lr=[1.3197561369402397e-05, 1.3197561369402397e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:15:40,480] [INFO] [timer.py:260:stop] epoch=1/micro_step=501/global_step=2830, RunningAvgSamplesPerSec=8.218320512675126, CurrSamplesPerSec=8.248893399163345, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2840, Loss: 0.0429571159183979 +[2024-01-21 20:16:19,396] [INFO] [logging.py:96:log_dist] [Rank 0] step=2840, skipped=0, lr=[1.3154306839193315e-05, 1.3154306839193315e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:16:19,397] [INFO] [timer.py:260:stop] epoch=1/micro_step=511/global_step=2840, RunningAvgSamplesPerSec=8.218344185212361, CurrSamplesPerSec=8.214005452963068, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2850, Loss: 0.02933369390666485 +[2024-01-21 20:16:58,353] [INFO] [logging.py:96:log_dist] [Rank 0] step=2850, skipped=0, lr=[1.3110986672884854e-05, 1.3110986672884854e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:16:58,354] [INFO] [timer.py:260:stop] epoch=1/micro_step=521/global_step=2850, RunningAvgSamplesPerSec=8.218337269505435, CurrSamplesPerSec=8.248037217209426, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2860, Loss: 0.17257879674434662 +[2024-01-21 20:17:37,343] [INFO] [logging.py:96:log_dist] [Rank 0] step=2860, skipped=0, lr=[1.306760177190064e-05, 1.306760177190064e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:17:37,344] [INFO] [timer.py:260:stop] epoch=1/micro_step=531/global_step=2860, RunningAvgSamplesPerSec=8.218306661168013, CurrSamplesPerSec=8.236566136965454, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2870, Loss: 0.15120241045951843 +[2024-01-21 20:18:16,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=2870, skipped=0, lr=[1.3024153039011345e-05, 1.3024153039011345e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:18:16,301] [INFO] [timer.py:260:stop] epoch=1/micro_step=541/global_step=2870, RunningAvgSamplesPerSec=8.218300211286154, CurrSamplesPerSec=8.215648069026813, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2880, Loss: 0.10071621090173721 +[2024-01-21 20:18:55,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=2880, skipped=0, lr=[1.2980641378315866e-05, 1.2980641378315866e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:18:55,293] [INFO] [timer.py:260:stop] epoch=1/micro_step=551/global_step=2880, RunningAvgSamplesPerSec=8.218268379802568, CurrSamplesPerSec=8.217275746369264, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2890, Loss: 0.02944914996623993 +[2024-01-21 20:19:34,240] [INFO] [logging.py:96:log_dist] [Rank 0] step=2890, skipped=0, lr=[1.2937067695222535e-05, 1.2937067695222535e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:19:34,240] [INFO] [timer.py:260:stop] epoch=1/micro_step=561/global_step=2890, RunningAvgSamplesPerSec=8.218269345045266, CurrSamplesPerSec=8.221510928816143, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2900, Loss: 0.05026421695947647 +[2024-01-21 20:20:13,138] [INFO] [logging.py:96:log_dist] [Rank 0] step=2900, skipped=0, lr=[1.2893432896430267e-05, 1.2893432896430267e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:20:13,139] [INFO] [timer.py:260:stop] epoch=1/micro_step=571/global_step=2900, RunningAvgSamplesPerSec=8.218305690797552, CurrSamplesPerSec=8.231682800453381, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2910, Loss: 0.07260092347860336 +[2024-01-21 20:20:52,100] [INFO] [logging.py:96:log_dist] [Rank 0] step=2910, skipped=0, lr=[1.2849737889909699e-05, 1.2849737889909699e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:20:52,101] [INFO] [timer.py:260:stop] epoch=1/micro_step=581/global_step=2910, RunningAvgSamplesPerSec=8.21829616295887, CurrSamplesPerSec=8.199212845137755, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2920, Loss: 0.2692094147205353 +[2024-01-21 20:21:31,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=2920, skipped=0, lr=[1.2805983584884296e-05, 1.2805983584884296e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:21:31,027] [INFO] [timer.py:260:stop] epoch=1/micro_step=591/global_step=2920, RunningAvgSamplesPerSec=8.218312115761917, CurrSamplesPerSec=8.24292309244679, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2930, Loss: 0.08829235285520554 +[2024-01-21 20:22:10,001] [INFO] [logging.py:96:log_dist] [Rank 0] step=2930, skipped=0, lr=[1.276217089181143e-05, 1.276217089181143e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:22:10,001] [INFO] [timer.py:260:stop] epoch=1/micro_step=601/global_step=2930, RunningAvgSamplesPerSec=8.21829319696136, CurrSamplesPerSec=8.137395414631627, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2940, Loss: 0.09388755261898041 +[2024-01-21 20:22:48,955] [INFO] [logging.py:96:log_dist] [Rank 0] step=2940, skipped=0, lr=[1.2718300722363431e-05, 1.2718300722363431e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:22:48,955] [INFO] [timer.py:260:stop] epoch=1/micro_step=611/global_step=2940, RunningAvgSamplesPerSec=8.21828930235555, CurrSamplesPerSec=8.197806612899335, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2950, Loss: 0.13387511670589447 +[2024-01-21 20:23:27,864] [INFO] [logging.py:96:log_dist] [Rank 0] step=2950, skipped=0, lr=[1.2674373989408626e-05, 1.2674373989408626e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:23:27,865] [INFO] [timer.py:260:stop] epoch=1/micro_step=621/global_step=2950, RunningAvgSamplesPerSec=8.218317058417924, CurrSamplesPerSec=8.244156973076617, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2960, Loss: 0.024763012304902077 +[2024-01-21 20:24:06,764] [INFO] [logging.py:96:log_dist] [Rank 0] step=2960, skipped=0, lr=[1.2630391606992337e-05, 1.2630391606992337e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:24:06,764] [INFO] [timer.py:260:stop] epoch=1/micro_step=631/global_step=2960, RunningAvgSamplesPerSec=8.218352079031806, CurrSamplesPerSec=8.239761346147601, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2970, Loss: 0.11963682621717453 +[2024-01-21 20:24:45,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=2970, skipped=0, lr=[1.2586354490317862e-05, 1.2586354490317862e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:24:45,650] [INFO] [timer.py:260:stop] epoch=1/micro_step=641/global_step=2970, RunningAvgSamplesPerSec=8.21839628758745, CurrSamplesPerSec=8.203903270785712, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2980, Loss: 0.11034701764583588 +[2024-01-21 20:25:24,615] [INFO] [logging.py:96:log_dist] [Rank 0] step=2980, skipped=0, lr=[1.2542263555727435e-05, 1.2542263555727435e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:25:24,615] [INFO] [timer.py:260:stop] epoch=1/micro_step=651/global_step=2980, RunningAvgSamplesPerSec=8.218384184778353, CurrSamplesPerSec=8.223671475000064, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 2990, Loss: 0.07028486579656601 +[2024-01-21 20:26:03,529] [INFO] [logging.py:96:log_dist] [Rank 0] step=2990, skipped=0, lr=[1.249811972068315e-05, 1.249811972068315e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:26:03,529] [INFO] [timer.py:260:stop] epoch=1/micro_step=661/global_step=2990, RunningAvgSamplesPerSec=8.218407845789983, CurrSamplesPerSec=8.244843184691238, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3000, Loss: 0.09946417063474655 +[2024-01-21 20:26:42,449] [INFO] [logging.py:96:log_dist] [Rank 0] step=3000, skipped=0, lr=[1.2453923903747875e-05, 1.2453923903747875e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:26:42,450] [INFO] [timer.py:260:stop] epoch=1/micro_step=671/global_step=3000, RunningAvgSamplesPerSec=8.218426892857826, CurrSamplesPerSec=8.201548115769054, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3010, Loss: 0.04099642485380173 +[2024-01-21 20:27:21,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=3010, skipped=0, lr=[1.2409677024566145e-05, 1.2409677024566145e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:27:21,433] [INFO] [timer.py:260:stop] epoch=1/micro_step=681/global_step=3010, RunningAvgSamplesPerSec=8.218401829863744, CurrSamplesPerSec=8.236470607078417, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3020, Loss: 0.04020826518535614 +[2024-01-21 20:28:00,387] [INFO] [logging.py:96:log_dist] [Rank 0] step=3020, skipped=0, lr=[1.2365380003845012e-05, 1.2365380003845012e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:28:00,388] [INFO] [timer.py:260:stop] epoch=1/micro_step=691/global_step=3020, RunningAvgSamplesPerSec=8.218396928100761, CurrSamplesPerSec=8.193568300182458, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3030, Loss: 0.17250002920627594 +[2024-01-21 20:28:39,322] [INFO] [logging.py:96:log_dist] [Rank 0] step=3030, skipped=0, lr=[1.2321033763334896e-05, 1.2321033763334896e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:28:39,322] [INFO] [timer.py:260:stop] epoch=1/micro_step=701/global_step=3030, RunningAvgSamplesPerSec=8.218405748574426, CurrSamplesPerSec=8.229171404304097, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3040, Loss: 0.0667734295129776 +[2024-01-21 20:29:18,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=3040, skipped=0, lr=[1.2276639225810402e-05, 1.2276639225810402e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:29:18,242] [INFO] [timer.py:260:stop] epoch=1/micro_step=711/global_step=3040, RunningAvgSamplesPerSec=8.21842542188671, CurrSamplesPerSec=8.226176489714256, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3050, Loss: 0.13149453699588776 +[2024-01-21 20:29:57,165] [INFO] [logging.py:96:log_dist] [Rank 0] step=3050, skipped=0, lr=[1.2232197315051123e-05, 1.2232197315051123e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:29:57,166] [INFO] [timer.py:260:stop] epoch=1/micro_step=721/global_step=3050, RunningAvgSamplesPerSec=8.218441846255722, CurrSamplesPerSec=8.198084015108625, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3060, Loss: 0.1920345425605774 +[2024-01-21 20:30:36,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=3060, skipped=0, lr=[1.2187708955822405e-05, 1.2187708955822405e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:30:36,148] [INFO] [timer.py:260:stop] epoch=1/micro_step=731/global_step=3060, RunningAvgSamplesPerSec=8.218417448504834, CurrSamplesPerSec=8.213503296533295, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3070, Loss: 0.11446835845708847 +[2024-01-21 20:31:15,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=3070, skipped=0, lr=[1.2143175073856124e-05, 1.2143175073856124e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:31:15,060] [INFO] [timer.py:260:stop] epoch=1/micro_step=741/global_step=3070, RunningAvgSamplesPerSec=8.218441803787309, CurrSamplesPerSec=8.236631846639689, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3080, Loss: 0.09256202727556229 +[2024-01-21 20:31:54,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=3080, skipped=0, lr=[1.20985965958314e-05, 1.20985965958314e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:31:54,030] [INFO] [timer.py:260:stop] epoch=1/micro_step=751/global_step=3080, RunningAvgSamplesPerSec=8.21842644055949, CurrSamplesPerSec=8.20397598227344, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3090, Loss: 0.049752332270145416 +[2024-01-21 20:32:32,912] [INFO] [logging.py:96:log_dist] [Rank 0] step=3090, skipped=0, lr=[1.2053974449355333e-05, 1.2053974449355333e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:32:32,912] [INFO] [timer.py:260:stop] epoch=1/micro_step=761/global_step=3090, RunningAvgSamplesPerSec=8.218471224406839, CurrSamplesPerSec=8.253602787504441, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3100, Loss: 0.0715722069144249 +[2024-01-21 20:33:11,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=3100, skipped=0, lr=[1.2009309562943692e-05, 1.2009309562943692e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:33:11,890] [INFO] [timer.py:260:stop] epoch=1/micro_step=771/global_step=3100, RunningAvgSamplesPerSec=8.218450281949917, CurrSamplesPerSec=8.152543461075076, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3110, Loss: 0.09606291353702545 +[2024-01-21 20:33:50,836] [INFO] [logging.py:96:log_dist] [Rank 0] step=3110, skipped=0, lr=[1.1964602866001596e-05, 1.1964602866001596e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:33:50,837] [INFO] [timer.py:260:stop] epoch=1/micro_step=781/global_step=3110, RunningAvgSamplesPerSec=8.218450812371893, CurrSamplesPerSec=8.249879571970169, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3120, Loss: 0.1497686803340912 +[2024-01-21 20:34:29,814] [INFO] [logging.py:96:log_dist] [Rank 0] step=3120, skipped=0, lr=[1.1919855288804174e-05, 1.1919855288804174e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:34:29,815] [INFO] [timer.py:260:stop] epoch=1/micro_step=791/global_step=3120, RunningAvgSamplesPerSec=8.218430131145617, CurrSamplesPerSec=8.207324112055169, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3130, Loss: 0.16443923115730286 +[2024-01-21 20:35:08,771] [INFO] [logging.py:96:log_dist] [Rank 0] step=3130, skipped=0, lr=[1.187506776247721e-05, 1.187506776247721e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:35:08,771] [INFO] [timer.py:260:stop] epoch=1/micro_step=801/global_step=3130, RunningAvgSamplesPerSec=8.218424138759756, CurrSamplesPerSec=8.241688568936917, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3140, Loss: 0.19555141031742096 +[2024-01-21 20:35:47,683] [INFO] [logging.py:96:log_dist] [Rank 0] step=3140, skipped=0, lr=[1.1830241218977762e-05, 1.1830241218977762e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:35:47,684] [INFO] [timer.py:260:stop] epoch=1/micro_step=811/global_step=3140, RunningAvgSamplesPerSec=8.218447647053512, CurrSamplesPerSec=8.22632925928441, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3150, Loss: 0.07511782646179199 +[2024-01-21 20:36:26,626] [INFO] [logging.py:96:log_dist] [Rank 0] step=3150, skipped=0, lr=[1.178537659107478e-05, 1.178537659107478e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:36:26,626] [INFO] [timer.py:260:stop] epoch=1/micro_step=821/global_step=3150, RunningAvgSamplesPerSec=8.218450763534449, CurrSamplesPerSec=8.20923066609964, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3160, Loss: 0.030895177274942398 +[2024-01-21 20:37:05,551] [INFO] [logging.py:96:log_dist] [Rank 0] step=3160, skipped=0, lr=[1.1740474812329682e-05, 1.1740474812329682e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:37:05,551] [INFO] [timer.py:260:stop] epoch=1/micro_step=831/global_step=3160, RunningAvgSamplesPerSec=8.218466019194993, CurrSamplesPerSec=8.202059337177811, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3170, Loss: 0.144735187292099 +[2024-01-21 20:37:44,528] [INFO] [logging.py:96:log_dist] [Rank 0] step=3170, skipped=0, lr=[1.1695536817076936e-05, 1.1695536817076936e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:37:44,529] [INFO] [timer.py:260:stop] epoch=1/micro_step=841/global_step=3170, RunningAvgSamplesPerSec=8.2184462119832, CurrSamplesPerSec=8.221431359380517, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3180, Loss: 0.05198684707283974 +[2024-01-21 20:38:23,483] [INFO] [logging.py:96:log_dist] [Rank 0] step=3180, skipped=0, lr=[1.1650563540404625e-05, 1.1650563540404625e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:38:23,484] [INFO] [timer.py:260:stop] epoch=1/micro_step=851/global_step=3180, RunningAvgSamplesPerSec=8.218440963946774, CurrSamplesPerSec=8.231557093272812, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3190, Loss: 0.04907825216650963 +[2024-01-21 20:39:02,427] [INFO] [logging.py:96:log_dist] [Rank 0] step=3190, skipped=0, lr=[1.1605555918134978e-05, 1.1605555918134978e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:39:02,427] [INFO] [timer.py:260:stop] epoch=1/micro_step=861/global_step=3190, RunningAvgSamplesPerSec=8.218443939000691, CurrSamplesPerSec=8.211880630200005, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3200, Loss: 0.1634242683649063 +[2024-01-21 20:39:41,373] [INFO] [logging.py:96:log_dist] [Rank 0] step=3200, skipped=0, lr=[1.15605148868049e-05, 1.15605148868049e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:39:41,374] [INFO] [timer.py:260:stop] epoch=1/micro_step=871/global_step=3200, RunningAvgSamplesPerSec=8.218444772366015, CurrSamplesPerSec=8.19174400799975, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3210, Loss: 0.07039367407560349 +[2024-01-21 20:40:20,325] [INFO] [logging.py:96:log_dist] [Rank 0] step=3210, skipped=0, lr=[1.151544138364649e-05, 1.151544138364649e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:40:20,325] [INFO] [timer.py:260:stop] epoch=1/micro_step=881/global_step=3210, RunningAvgSamplesPerSec=8.218442322460879, CurrSamplesPerSec=8.23815458285335, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3220, Loss: 0.14986221492290497 +[2024-01-21 20:40:59,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=3220, skipped=0, lr=[1.1470336346567523e-05, 1.1470336346567523e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:40:59,284] [INFO] [timer.py:260:stop] epoch=1/micro_step=891/global_step=3220, RunningAvgSamplesPerSec=8.218434866562202, CurrSamplesPerSec=8.21062625467736, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3230, Loss: 0.10038311779499054 +[2024-01-21 20:41:38,302] [INFO] [logging.py:96:log_dist] [Rank 0] step=3230, skipped=0, lr=[1.1425200714131957e-05, 1.1425200714131957e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:41:38,303] [INFO] [timer.py:260:stop] epoch=1/micro_step=901/global_step=3230, RunningAvgSamplesPerSec=8.218388542847755, CurrSamplesPerSec=8.107402736098425, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3240, Loss: 0.12128898501396179 +[2024-01-21 20:42:17,232] [INFO] [logging.py:96:log_dist] [Rank 0] step=3240, skipped=0, lr=[1.1380035425540383e-05, 1.1380035425540383e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:42:17,233] [INFO] [timer.py:260:stop] epoch=1/micro_step=911/global_step=3240, RunningAvgSamplesPerSec=8.21840008237945, CurrSamplesPerSec=8.220969081356387, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3250, Loss: 0.1367516815662384 +[2024-01-21 20:42:56,142] [INFO] [logging.py:96:log_dist] [Rank 0] step=3250, skipped=0, lr=[1.13348414206105e-05, 1.13348414206105e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:42:56,143] [INFO] [timer.py:260:stop] epoch=1/micro_step=921/global_step=3250, RunningAvgSamplesPerSec=8.218424829777708, CurrSamplesPerSec=8.207211694184405, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3260, Loss: 0.0428396575152874 +[2024-01-21 20:43:35,115] [INFO] [logging.py:96:log_dist] [Rank 0] step=3260, skipped=0, lr=[1.128961963975753e-05, 1.128961963975753e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:43:35,115] [INFO] [timer.py:260:stop] epoch=1/micro_step=931/global_step=3260, RunningAvgSamplesPerSec=8.218408643273381, CurrSamplesPerSec=8.208744656687692, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3270, Loss: 0.07879356294870377 +[2024-01-21 20:44:14,049] [INFO] [logging.py:96:log_dist] [Rank 0] step=3270, skipped=0, lr=[1.1244371023974686e-05, 1.1244371023974686e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:44:14,049] [INFO] [timer.py:260:stop] epoch=1/micro_step=941/global_step=3270, RunningAvgSamplesPerSec=8.218417247501966, CurrSamplesPerSec=8.23585502318038, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3280, Loss: 0.11567846685647964 +[2024-01-21 20:44:53,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=3280, skipped=0, lr=[1.1199096514813559e-05, 1.1199096514813559e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:44:53,022] [INFO] [timer.py:260:stop] epoch=1/micro_step=951/global_step=3280, RunningAvgSamplesPerSec=8.218401124579858, CurrSamplesPerSec=8.191433039241547, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3290, Loss: 0.09279868751764297 +[2024-01-21 20:45:31,949] [INFO] [logging.py:96:log_dist] [Rank 0] step=3290, skipped=0, lr=[1.1153797054364553e-05, 1.1153797054364553e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:45:31,950] [INFO] [timer.py:260:stop] epoch=1/micro_step=961/global_step=3290, RunningAvgSamplesPerSec=8.218414313635561, CurrSamplesPerSec=8.239113912558206, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3300, Loss: 0.04692619666457176 +[2024-01-21 20:46:10,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=3300, skipped=0, lr=[1.1108473585237254e-05, 1.1108473585237254e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:46:10,896] [INFO] [timer.py:260:stop] epoch=1/micro_step=971/global_step=3300, RunningAvgSamplesPerSec=8.218415192633513, CurrSamplesPerSec=8.19936461484815, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3310, Loss: 0.13408496975898743 +[2024-01-21 20:46:49,872] [INFO] [logging.py:96:log_dist] [Rank 0] step=3310, skipped=0, lr=[1.1063127050540843e-05, 1.1063127050540843e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:46:49,872] [INFO] [timer.py:260:stop] epoch=1/micro_step=981/global_step=3310, RunningAvgSamplesPerSec=8.218396878975875, CurrSamplesPerSec=8.241096999329072, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3320, Loss: 0.12346500903367996 +[2024-01-21 20:47:28,872] [INFO] [logging.py:96:log_dist] [Rank 0] step=3320, skipped=0, lr=[1.1017758393864452e-05, 1.1017758393864452e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:47:28,873] [INFO] [timer.py:260:stop] epoch=1/micro_step=991/global_step=3320, RunningAvgSamplesPerSec=8.218363271180188, CurrSamplesPerSec=8.190990624385337, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3330, Loss: 0.2408159375190735 +[2024-01-21 20:48:07,857] [INFO] [logging.py:96:log_dist] [Rank 0] step=3330, skipped=0, lr=[1.0972368559257538e-05, 1.0972368559257538e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:48:07,858] [INFO] [timer.py:260:stop] epoch=1/micro_step=1001/global_step=3330, RunningAvgSamplesPerSec=8.218339533405569, CurrSamplesPerSec=8.203906780965855, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3340, Loss: 0.03907247260212898 +[2024-01-21 20:48:46,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=3340, skipped=0, lr=[1.0926958491210238e-05, 1.0926958491210238e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:48:46,802] [INFO] [timer.py:260:stop] epoch=1/micro_step=1011/global_step=3340, RunningAvgSamplesPerSec=8.218341965791243, CurrSamplesPerSec=8.175738344654809, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3350, Loss: 0.020410774275660515 +[2024-01-21 20:49:25,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=3350, skipped=0, lr=[1.0881529134633712e-05, 1.0881529134633712e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:49:25,720] [INFO] [timer.py:260:stop] epoch=1/micro_step=1021/global_step=3350, RunningAvgSamplesPerSec=8.218360266855708, CurrSamplesPerSec=8.212221794193416, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3360, Loss: 0.20841538906097412 +[2024-01-21 20:50:04,646] [INFO] [logging.py:96:log_dist] [Rank 0] step=3360, skipped=0, lr=[1.0836081434840488e-05, 1.0836081434840488e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:50:04,646] [INFO] [timer.py:260:stop] epoch=1/micro_step=1031/global_step=3360, RunningAvgSamplesPerSec=8.218374212609799, CurrSamplesPerSec=8.230567725007306, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3370, Loss: 0.08473961800336838 +[2024-01-21 20:50:43,569] [INFO] [logging.py:96:log_dist] [Rank 0] step=3370, skipped=0, lr=[1.0790616337524783e-05, 1.0790616337524783e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:50:43,570] [INFO] [timer.py:260:stop] epoch=1/micro_step=1041/global_step=3370, RunningAvgSamplesPerSec=8.218389335008423, CurrSamplesPerSec=8.230291652943661, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3380, Loss: 0.05112446844577789 +[2024-01-21 20:51:22,532] [INFO] [logging.py:96:log_dist] [Rank 0] step=3380, skipped=0, lr=[1.0745134788742826e-05, 1.0745134788742826e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:51:22,533] [INFO] [timer.py:260:stop] epoch=1/micro_step=1051/global_step=3380, RunningAvgSamplesPerSec=8.218379964998752, CurrSamplesPerSec=8.229238005155286, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3390, Loss: 0.060620326548814774 +[2024-01-21 20:52:01,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=3390, skipped=0, lr=[1.0699637734893183e-05, 1.0699637734893183e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:52:01,467] [INFO] [timer.py:260:stop] epoch=1/micro_step=1061/global_step=3390, RunningAvgSamplesPerSec=8.218388110816019, CurrSamplesPerSec=8.248944096386076, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3400, Loss: 0.09953106194734573 +[2024-01-21 20:52:40,460] [INFO] [logging.py:96:log_dist] [Rank 0] step=3400, skipped=0, lr=[1.065412612269705e-05, 1.065412612269705e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:52:40,461] [INFO] [timer.py:260:stop] epoch=1/micro_step=1071/global_step=3400, RunningAvgSamplesPerSec=8.218359592337091, CurrSamplesPerSec=8.223916868775511, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3410, Loss: 0.2202194184064865 +[2024-01-21 20:53:19,411] [INFO] [logging.py:96:log_dist] [Rank 0] step=3410, skipped=0, lr=[1.0608600899178563e-05, 1.0608600899178563e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:53:19,412] [INFO] [timer.py:260:stop] epoch=1/micro_step=1081/global_step=3410, RunningAvgSamplesPerSec=8.218357628922165, CurrSamplesPerSec=8.196537011362063, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3420, Loss: 0.023680180311203003 +[2024-01-21 20:53:58,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=3420, skipped=0, lr=[1.0563063011645081e-05, 1.0563063011645081e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:53:58,338] [INFO] [timer.py:260:stop] epoch=1/micro_step=1091/global_step=3420, RunningAvgSamplesPerSec=8.218371044448956, CurrSamplesPerSec=8.227324669261712, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3430, Loss: 0.05181693285703659 +[2024-01-21 20:54:37,253] [INFO] [logging.py:96:log_dist] [Rank 0] step=3430, skipped=0, lr=[1.0517513407667487e-05, 1.0517513407667487e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:54:37,253] [INFO] [timer.py:260:stop] epoch=1/micro_step=1101/global_step=3430, RunningAvgSamplesPerSec=8.218391114823472, CurrSamplesPerSec=8.194176077887871, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3440, Loss: 0.05882517993450165 +[2024-01-21 20:55:16,155] [INFO] [logging.py:96:log_dist] [Rank 0] step=3440, skipped=0, lr=[1.0471953035060468e-05, 1.0471953035060468e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:55:16,156] [INFO] [timer.py:260:stop] epoch=1/micro_step=1111/global_step=3440, RunningAvgSamplesPerSec=8.218419016582173, CurrSamplesPerSec=8.227496142108219, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3450, Loss: 0.10017658025026321 +[2024-01-21 20:55:55,090] [INFO] [logging.py:96:log_dist] [Rank 0] step=3450, skipped=0, lr=[1.0426382841862776e-05, 1.0426382841862776e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:55:55,091] [INFO] [timer.py:260:stop] epoch=1/micro_step=1121/global_step=3450, RunningAvgSamplesPerSec=8.218426514427547, CurrSamplesPerSec=8.230112998261541, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3460, Loss: 0.0782790407538414 +[2024-01-21 20:56:34,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=3460, skipped=0, lr=[1.0380803776317528e-05, 1.0380803776317528e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:56:34,041] [INFO] [timer.py:260:stop] epoch=1/micro_step=1131/global_step=3460, RunningAvgSamplesPerSec=8.218425176923208, CurrSamplesPerSec=8.221005840124516, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3470, Loss: 0.11397960782051086 +[2024-01-21 20:57:12,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=3470, skipped=0, lr=[1.0335216786852448e-05, 1.0335216786852448e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:57:12,941] [INFO] [timer.py:260:stop] epoch=1/micro_step=1141/global_step=3470, RunningAvgSamplesPerSec=8.218453918618671, CurrSamplesPerSec=8.211614853448765, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3480, Loss: 0.0419963039457798 +[2024-01-21 20:57:51,825] [INFO] [logging.py:96:log_dist] [Rank 0] step=3480, skipped=0, lr=[1.0289622822060157e-05, 1.0289622822060157e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:57:51,826] [INFO] [timer.py:260:stop] epoch=1/micro_step=1151/global_step=3480, RunningAvgSamplesPerSec=8.218491891096047, CurrSamplesPerSec=8.215016989434815, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3490, Loss: 0.22996075451374054 +[2024-01-21 20:58:30,779] [INFO] [logging.py:96:log_dist] [Rank 0] step=3490, skipped=0, lr=[1.024402283067841e-05, 1.024402283067841e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:58:30,779] [INFO] [timer.py:260:stop] epoch=1/micro_step=1161/global_step=3490, RunningAvgSamplesPerSec=8.218487854809277, CurrSamplesPerSec=8.204027633285616, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3500, Loss: 0.015949329361319542 +[2024-01-21 20:59:09,721] [INFO] [logging.py:96:log_dist] [Rank 0] step=3500, skipped=0, lr=[1.0198417761570374e-05, 1.0198417761570374e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:59:09,722] [INFO] [timer.py:260:stop] epoch=1/micro_step=1171/global_step=3500, RunningAvgSamplesPerSec=8.21849074638791, CurrSamplesPerSec=8.236623253776548, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3510, Loss: 0.12217298895120621 +[2024-01-21 20:59:48,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=3510, skipped=0, lr=[1.015280856370487e-05, 1.015280856370487e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 20:59:48,626] [INFO] [timer.py:260:stop] epoch=1/micro_step=1181/global_step=3510, RunningAvgSamplesPerSec=8.218516739570196, CurrSamplesPerSec=8.202884442613673, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3520, Loss: 0.1480383276939392 +[2024-01-21 21:00:27,603] [INFO] [logging.py:96:log_dist] [Rank 0] step=3520, skipped=0, lr=[1.0107196186136631e-05, 1.0107196186136631e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:00:27,603] [INFO] [timer.py:260:stop] epoch=1/micro_step=1191/global_step=3520, RunningAvgSamplesPerSec=8.218498395063705, CurrSamplesPerSec=8.212318772343414, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3530, Loss: 0.1183416023850441 +[2024-01-21 21:01:06,578] [INFO] [logging.py:96:log_dist] [Rank 0] step=3530, skipped=0, lr=[1.0061581577986564e-05, 1.0061581577986564e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:01:06,578] [INFO] [timer.py:260:stop] epoch=1/micro_step=1201/global_step=3530, RunningAvgSamplesPerSec=8.21848193664738, CurrSamplesPerSec=8.244755566072065, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3540, Loss: 0.06962604075670242 +[2024-01-21 21:01:45,448] [INFO] [logging.py:96:log_dist] [Rank 0] step=3540, skipped=0, lr=[1.0015965688421979e-05, 1.0015965688421979e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:01:45,448] [INFO] [timer.py:260:stop] epoch=1/micro_step=1211/global_step=3540, RunningAvgSamplesPerSec=8.218528080590302, CurrSamplesPerSec=8.236456454691021, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3550, Loss: 0.025137219578027725 +[2024-01-21 21:02:24,428] [INFO] [logging.py:96:log_dist] [Rank 0] step=3550, skipped=0, lr=[9.970349466636857e-06, 9.970349466636857e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:02:24,429] [INFO] [timer.py:260:stop] epoch=1/micro_step=1221/global_step=3550, RunningAvgSamplesPerSec=8.218508110175925, CurrSamplesPerSec=8.250781784335839, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3560, Loss: 0.10791552811861038 +[2024-01-21 21:03:03,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=3560, skipped=0, lr=[9.92473386183209e-06, 9.92473386183209e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:03:03,423] [INFO] [timer.py:260:stop] epoch=1/micro_step=1231/global_step=3560, RunningAvgSamplesPerSec=8.21847996693029, CurrSamplesPerSec=8.219051031973152, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3570, Loss: 0.1464160978794098 +[2024-01-21 21:03:42,363] [INFO] [logging.py:96:log_dist] [Rank 0] step=3570, skipped=0, lr=[9.879119823195735e-06, 9.879119823195735e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:03:42,364] [INFO] [timer.py:260:stop] epoch=1/micro_step=1241/global_step=3570, RunningAvgSamplesPerSec=8.218483747988879, CurrSamplesPerSec=8.21972300252665, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3580, Loss: 0.026708880439400673 +[2024-01-21 21:04:21,297] [INFO] [logging.py:96:log_dist] [Rank 0] step=3580, skipped=0, lr=[9.83350829988325e-06, 9.83350829988325e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:04:21,297] [INFO] [timer.py:260:stop] epoch=1/micro_step=1251/global_step=3580, RunningAvgSamplesPerSec=8.21849197222666, CurrSamplesPerSec=8.170878597999117, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3590, Loss: 0.09169787168502808 +[2024-01-21 21:05:00,190] [INFO] [logging.py:96:log_dist] [Rank 0] step=3590, skipped=0, lr=[9.787900240997768e-06, 9.787900240997768e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:05:00,190] [INFO] [timer.py:260:stop] epoch=1/micro_step=1261/global_step=3590, RunningAvgSamplesPerSec=8.218524096269087, CurrSamplesPerSec=8.232706270444869, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3600, Loss: 0.08692147582769394 +[2024-01-21 21:05:39,132] [INFO] [logging.py:96:log_dist] [Rank 0] step=3600, skipped=0, lr=[9.742296595570316e-06, 9.742296595570316e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:05:39,133] [INFO] [timer.py:260:stop] epoch=1/micro_step=1271/global_step=3600, RunningAvgSamplesPerSec=8.218526632504076, CurrSamplesPerSec=8.197316948682474, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3610, Loss: 0.02320829965174198 +[2024-01-21 21:06:18,083] [INFO] [logging.py:96:log_dist] [Rank 0] step=3610, skipped=0, lr=[9.6966983125401e-06, 9.6966983125401e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:06:18,083] [INFO] [timer.py:260:stop] epoch=1/micro_step=1281/global_step=3610, RunningAvgSamplesPerSec=8.218524781290109, CurrSamplesPerSec=8.229153240622686, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3620, Loss: 0.04932890832424164 +[2024-01-21 21:06:57,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=3620, skipped=0, lr=[9.651106340734729e-06, 9.651106340734729e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:06:57,034] [INFO] [timer.py:260:stop] epoch=1/micro_step=1291/global_step=3620, RunningAvgSamplesPerSec=8.218522785498452, CurrSamplesPerSec=8.229656303329346, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3630, Loss: 0.12001072615385056 +[2024-01-21 21:07:35,966] [INFO] [logging.py:96:log_dist] [Rank 0] step=3630, skipped=0, lr=[9.605521628850496e-06, 9.605521628850496e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:07:35,966] [INFO] [timer.py:260:stop] epoch=1/micro_step=1301/global_step=3630, RunningAvgSamplesPerSec=8.21853128864398, CurrSamplesPerSec=8.23201854326992, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3640, Loss: 0.06495397537946701 +[2024-01-21 21:08:14,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=3640, skipped=0, lr=[9.55994512543262e-06, 9.55994512543262e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:08:14,938] [INFO] [timer.py:260:stop] epoch=1/micro_step=1311/global_step=3640, RunningAvgSamplesPerSec=8.218517168541101, CurrSamplesPerSec=8.246293467340635, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3650, Loss: 0.12584149837493896 +[2024-01-21 21:08:53,839] [INFO] [logging.py:96:log_dist] [Rank 0] step=3650, skipped=0, lr=[9.514377778855521e-06, 9.514377778855521e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:08:53,840] [INFO] [timer.py:260:stop] epoch=1/micro_step=1321/global_step=3650, RunningAvgSamplesPerSec=8.218543083911644, CurrSamplesPerSec=8.244712517143757, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3660, Loss: 0.1423175185918808 +[2024-01-21 21:09:32,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=3660, skipped=0, lr=[9.468820537303071e-06, 9.468820537303071e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:09:32,737] [INFO] [timer.py:260:stop] epoch=1/micro_step=1331/global_step=3660, RunningAvgSamplesPerSec=8.218572250210599, CurrSamplesPerSec=8.22634993151028, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3670, Loss: 0.09550794959068298 +[2024-01-21 21:10:11,664] [INFO] [logging.py:96:log_dist] [Rank 0] step=3670, skipped=0, lr=[9.42327434874888e-06, 9.42327434874888e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:10:11,664] [INFO] [timer.py:260:stop] epoch=1/micro_step=1341/global_step=3670, RunningAvgSamplesPerSec=8.21858291137238, CurrSamplesPerSec=8.25040850110124, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3680, Loss: 0.0491308867931366 +[2024-01-21 21:10:50,578] [INFO] [logging.py:96:log_dist] [Rank 0] step=3680, skipped=0, lr=[9.377740160936564e-06, 9.377740160936564e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:10:50,579] [INFO] [timer.py:260:stop] epoch=1/micro_step=1351/global_step=3680, RunningAvgSamplesPerSec=8.218601602881005, CurrSamplesPerSec=8.220190174887476, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3690, Loss: 0.042253948748111725 +[2024-01-21 21:11:29,530] [INFO] [logging.py:96:log_dist] [Rank 0] step=3690, skipped=0, lr=[9.332218921360013e-06, 9.332218921360013e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:11:29,530] [INFO] [timer.py:260:stop] epoch=1/micro_step=1361/global_step=3690, RunningAvgSamplesPerSec=8.218598804836052, CurrSamplesPerSec=8.209810136822087, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3700, Loss: 0.11322741210460663 +[2024-01-21 21:12:08,477] [INFO] [logging.py:96:log_dist] [Rank 0] step=3700, skipped=0, lr=[9.2867115772437e-06, 9.2867115772437e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:12:08,478] [INFO] [timer.py:260:stop] epoch=1/micro_step=1371/global_step=3700, RunningAvgSamplesPerSec=8.21859838195715, CurrSamplesPerSec=8.224062499521297, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3710, Loss: 0.16779431700706482 +[2024-01-21 21:12:47,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=3710, skipped=0, lr=[9.241219075522934e-06, 9.241219075522934e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:12:47,402] [INFO] [timer.py:260:stop] epoch=1/micro_step=1381/global_step=3710, RunningAvgSamplesPerSec=8.218611011325557, CurrSamplesPerSec=8.1861851304597, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3720, Loss: 0.14394733309745789 +[2024-01-21 21:13:26,345] [INFO] [logging.py:96:log_dist] [Rank 0] step=3720, skipped=0, lr=[9.1957423628242e-06, 9.1957423628242e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:13:26,346] [INFO] [timer.py:260:stop] epoch=1/micro_step=1391/global_step=3720, RunningAvgSamplesPerSec=8.218613136016012, CurrSamplesPerSec=8.20092671673989, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3730, Loss: 0.06924354285001755 +[2024-01-21 21:14:05,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=3730, skipped=0, lr=[9.150282385445423e-06, 9.150282385445423e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:14:05,285] [INFO] [timer.py:260:stop] epoch=1/micro_step=1401/global_step=3730, RunningAvgSamplesPerSec=8.218617343457217, CurrSamplesPerSec=8.242514074444635, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3740, Loss: 0.0929742157459259 +[2024-01-21 21:14:44,237] [INFO] [logging.py:96:log_dist] [Rank 0] step=3740, skipped=0, lr=[9.104840089336305e-06, 9.104840089336305e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:14:44,238] [INFO] [timer.py:260:stop] epoch=1/micro_step=1411/global_step=3740, RunningAvgSamplesPerSec=8.218614145708997, CurrSamplesPerSec=8.203972472034078, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3750, Loss: 0.148799866437912 +[2024-01-21 21:15:23,196] [INFO] [logging.py:96:log_dist] [Rank 0] step=3750, skipped=0, lr=[9.059416420078611e-06, 9.059416420078611e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:15:23,196] [INFO] [timer.py:260:stop] epoch=1/micro_step=1421/global_step=3750, RunningAvgSamplesPerSec=8.218607345979354, CurrSamplesPerSec=8.225883067049926, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3760, Loss: 0.10878393799066544 +[2024-01-21 21:16:02,138] [INFO] [logging.py:96:log_dist] [Rank 0] step=3760, skipped=0, lr=[9.014012322866532e-06, 9.014012322866532e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:16:02,138] [INFO] [timer.py:260:stop] epoch=1/micro_step=1431/global_step=3760, RunningAvgSamplesPerSec=8.218609863703394, CurrSamplesPerSec=8.208225071849176, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3770, Loss: 0.03315019980072975 +[2024-01-21 21:16:41,114] [INFO] [logging.py:96:log_dist] [Rank 0] step=3770, skipped=0, lr=[8.968628742486982e-06, 8.968628742486982e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:16:41,115] [INFO] [timer.py:260:stop] epoch=1/micro_step=1441/global_step=3770, RunningAvgSamplesPerSec=8.218593264897766, CurrSamplesPerSec=8.242283766577422, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3780, Loss: 0.03259813413023949 +[2024-01-21 21:17:20,137] [INFO] [logging.py:96:log_dist] [Rank 0] step=3780, skipped=0, lr=[8.923266623299958e-06, 8.923266623299958e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:17:20,137] [INFO] [timer.py:260:stop] epoch=1/micro_step=1451/global_step=3780, RunningAvgSamplesPerSec=8.218550802528707, CurrSamplesPerSec=8.218745032357493, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3790, Loss: 0.1351025253534317 +[2024-01-21 21:17:59,073] [INFO] [logging.py:96:log_dist] [Rank 0] step=3790, skipped=0, lr=[8.87792690921888e-06, 8.87792690921888e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:17:59,074] [INFO] [timer.py:260:stop] epoch=1/micro_step=1461/global_step=3790, RunningAvgSamplesPerSec=8.218556750809729, CurrSamplesPerSec=8.24311141636211, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3800, Loss: 0.021946584805846214 +[2024-01-21 21:18:37,985] [INFO] [logging.py:96:log_dist] [Rank 0] step=3800, skipped=0, lr=[8.832610543690957e-06, 8.832610543690957e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:18:37,986] [INFO] [timer.py:260:stop] epoch=1/micro_step=1471/global_step=3800, RunningAvgSamplesPerSec=8.218576096881275, CurrSamplesPerSec=8.235790336619544, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3810, Loss: 0.13361869752407074 +[2024-01-21 21:19:16,950] [INFO] [logging.py:96:log_dist] [Rank 0] step=3810, skipped=0, lr=[8.78731846967755e-06, 8.78731846967755e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:19:16,950] [INFO] [timer.py:260:stop] epoch=1/micro_step=1481/global_step=3810, RunningAvgSamplesPerSec=8.218566239366288, CurrSamplesPerSec=8.187768187194266, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3820, Loss: 0.04840990900993347 +[2024-01-21 21:19:55,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=3820, skipped=0, lr=[8.742051629634553e-06, 8.742051629634553e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:19:55,859] [INFO] [timer.py:260:stop] epoch=1/micro_step=1491/global_step=3820, RunningAvgSamplesPerSec=8.218587121676329, CurrSamplesPerSec=8.234801971502145, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3830, Loss: 0.042287249118089676 +[2024-01-21 21:20:34,832] [INFO] [logging.py:96:log_dist] [Rank 0] step=3830, skipped=0, lr=[8.696810965492782e-06, 8.696810965492782e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:20:34,833] [INFO] [timer.py:260:stop] epoch=1/micro_step=1501/global_step=3830, RunningAvgSamplesPerSec=8.218572346895774, CurrSamplesPerSec=8.201164240412199, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3840, Loss: 0.06623029708862305 +[2024-01-21 21:21:13,734] [INFO] [logging.py:96:log_dist] [Rank 0] step=3840, skipped=0, lr=[8.65159741863837e-06, 8.65159741863837e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:21:13,734] [INFO] [timer.py:260:stop] epoch=1/micro_step=1511/global_step=3840, RunningAvgSamplesPerSec=8.218597459900497, CurrSamplesPerSec=8.232508322506336, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3850, Loss: 0.0912681370973587 +[2024-01-21 21:21:52,770] [INFO] [logging.py:96:log_dist] [Rank 0] step=3850, skipped=0, lr=[8.606411929893188e-06, 8.606411929893188e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:21:52,770] [INFO] [timer.py:260:stop] epoch=1/micro_step=1521/global_step=3850, RunningAvgSamplesPerSec=8.218548567363358, CurrSamplesPerSec=8.209268826299953, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3860, Loss: 0.09013911336660385 +[2024-01-21 21:22:31,746] [INFO] [logging.py:96:log_dist] [Rank 0] step=3860, skipped=0, lr=[8.561255439495265e-06, 8.561255439495265e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:22:31,746] [INFO] [timer.py:260:stop] epoch=1/micro_step=1531/global_step=3860, RunningAvgSamplesPerSec=8.218532825609664, CurrSamplesPerSec=8.207563010256326, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3870, Loss: 0.03211949020624161 +[2024-01-21 21:23:10,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=3870, skipped=0, lr=[8.516128887079204e-06, 8.516128887079204e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:23:10,680] [INFO] [timer.py:260:stop] epoch=1/micro_step=1541/global_step=3870, RunningAvgSamplesPerSec=8.218540374885485, CurrSamplesPerSec=8.211244603364538, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3880, Loss: 0.021962566301226616 +[2024-01-21 21:23:49,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=3880, skipped=0, lr=[8.47103321165667e-06, 8.47103321165667e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:23:49,578] [INFO] [timer.py:260:stop] epoch=1/micro_step=1551/global_step=3880, RunningAvgSamplesPerSec=8.218567153196025, CurrSamplesPerSec=8.228139228946178, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3890, Loss: 0.21556846797466278 +[2024-01-21 21:24:28,486] [INFO] [logging.py:96:log_dist] [Rank 0] step=3890, skipped=0, lr=[8.425969351596804e-06, 8.425969351596804e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:24:28,486] [INFO] [timer.py:260:stop] epoch=1/micro_step=1561/global_step=3890, RunningAvgSamplesPerSec=8.218588135072991, CurrSamplesPerSec=8.235917689255198, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3900, Loss: 0.18136905133724213 +[2024-01-21 21:25:07,455] [INFO] [logging.py:96:log_dist] [Rank 0] step=3900, skipped=0, lr=[8.380938244606742e-06, 8.380938244606742e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:25:07,456] [INFO] [timer.py:260:stop] epoch=1/micro_step=1571/global_step=3900, RunningAvgSamplesPerSec=8.218575905321991, CurrSamplesPerSec=8.184516841901143, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3910, Loss: 0.046589698642492294 +[2024-01-21 21:25:46,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=3910, skipped=0, lr=[8.33594082771206e-06, 8.33594082771206e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:25:46,442] [INFO] [timer.py:260:stop] epoch=1/micro_step=1581/global_step=3910, RunningAvgSamplesPerSec=8.218554593464921, CurrSamplesPerSec=8.197320953878448, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3920, Loss: 0.03936820104718208 +[2024-01-21 21:26:25,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=3920, skipped=0, lr=[8.290978037237316e-06, 8.290978037237316e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:26:25,336] [INFO] [timer.py:260:stop] epoch=1/micro_step=1591/global_step=3920, RunningAvgSamplesPerSec=8.21858321686424, CurrSamplesPerSec=8.231739344694127, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3930, Loss: 0.031540267169475555 +[2024-01-21 21:27:04,242] [INFO] [logging.py:96:log_dist] [Rank 0] step=3930, skipped=0, lr=[8.246050808786527e-06, 8.246050808786527e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:27:04,243] [INFO] [timer.py:260:stop] epoch=1/micro_step=1601/global_step=3930, RunningAvgSamplesPerSec=8.21860481521701, CurrSamplesPerSec=8.20913426296329, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3940, Loss: 0.10934942960739136 +[2024-01-21 21:27:43,237] [INFO] [logging.py:96:log_dist] [Rank 0] step=3940, skipped=0, lr=[8.201160077223737e-06, 8.201160077223737e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:27:43,237] [INFO] [timer.py:260:stop] epoch=1/micro_step=1611/global_step=3940, RunningAvgSamplesPerSec=8.21857909344574, CurrSamplesPerSec=8.193920950339981, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3950, Loss: 0.10080189257860184 +[2024-01-21 21:28:22,172] [INFO] [logging.py:96:log_dist] [Rank 0] step=3950, skipped=0, lr=[8.15630677665355e-06, 8.15630677665355e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:28:22,172] [INFO] [timer.py:260:stop] epoch=1/micro_step=1621/global_step=3950, RunningAvgSamplesPerSec=8.218585381431698, CurrSamplesPerSec=8.226366570206565, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3960, Loss: 0.02887783758342266 +[2024-01-21 21:29:01,089] [INFO] [logging.py:96:log_dist] [Rank 0] step=3960, skipped=0, lr=[8.111491840401673e-06, 8.111491840401673e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:29:01,090] [INFO] [timer.py:260:stop] epoch=1/micro_step=1631/global_step=3960, RunningAvgSamplesPerSec=8.218601068477195, CurrSamplesPerSec=8.223684071859868, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3970, Loss: 0.03933325037360191 +[2024-01-21 21:29:40,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=3970, skipped=0, lr=[8.06671620099553e-06, 8.06671620099553e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:29:40,015] [INFO] [timer.py:260:stop] epoch=1/micro_step=1641/global_step=3970, RunningAvgSamplesPerSec=8.218612725309832, CurrSamplesPerSec=8.190951134309095, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3980, Loss: 0.033879924565553665 +[2024-01-21 21:30:18,946] [INFO] [logging.py:96:log_dist] [Rank 0] step=3980, skipped=0, lr=[8.021980790144828e-06, 8.021980790144828e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:30:18,947] [INFO] [timer.py:260:stop] epoch=1/micro_step=1651/global_step=3980, RunningAvgSamplesPerSec=8.21862038848853, CurrSamplesPerSec=8.19096763011742, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 3990, Loss: 0.04866497591137886 +[2024-01-21 21:30:57,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=3990, skipped=0, lr=[7.977286538722193e-06, 7.977286538722193e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:30:57,883] [INFO] [timer.py:260:stop] epoch=1/micro_step=1661/global_step=3990, RunningAvgSamplesPerSec=8.218625991267963, CurrSamplesPerSec=8.191294560752855, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4000, Loss: 0.12630395591259003 +[2024-01-21 21:31:36,800] [INFO] [logging.py:96:log_dist] [Rank 0] step=4000, skipped=0, lr=[7.932634376743776e-06, 7.932634376743776e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:31:36,801] [INFO] [timer.py:260:stop] epoch=1/micro_step=1671/global_step=4000, RunningAvgSamplesPerSec=8.218641075792945, CurrSamplesPerSec=8.243654161368424, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4010, Loss: 0.0491059273481369 +[2024-01-21 21:32:15,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=4010, skipped=0, lr=[7.88802523334993e-06, 7.88802523334993e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:32:15,766] [INFO] [timer.py:260:stop] epoch=1/micro_step=1681/global_step=4010, RunningAvgSamplesPerSec=8.218631195693334, CurrSamplesPerSec=8.198852727615476, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4020, Loss: 0.07056733220815659 +[2024-01-21 21:32:54,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=4020, skipped=0, lr=[7.84346003678584e-06, 7.84346003678584e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:32:54,736] [INFO] [timer.py:260:stop] epoch=1/micro_step=1691/global_step=4020, RunningAvgSamplesPerSec=8.218618750827442, CurrSamplesPerSec=8.229778924725803, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4030, Loss: 0.029223904013633728 +[2024-01-21 21:33:33,653] [INFO] [logging.py:96:log_dist] [Rank 0] step=4030, skipped=0, lr=[7.798939714382245e-06, 7.798939714382245e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:33:33,653] [INFO] [timer.py:260:stop] epoch=1/micro_step=1701/global_step=4030, RunningAvgSamplesPerSec=8.218634398510519, CurrSamplesPerSec=8.21919548378733, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4040, Loss: 0.04605334997177124 +[2024-01-21 21:34:12,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=4040, skipped=0, lr=[7.754465192536121e-06, 7.754465192536121e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:34:12,593] [INFO] [timer.py:260:stop] epoch=1/micro_step=1711/global_step=4040, RunningAvgSamplesPerSec=8.21863777092022, CurrSamplesPerSec=8.220710268650647, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4050, Loss: 0.028828049078583717 +[2024-01-21 21:34:51,529] [INFO] [logging.py:96:log_dist] [Rank 0] step=4050, skipped=0, lr=[7.710037396691393e-06, 7.710037396691393e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:34:51,530] [INFO] [timer.py:260:stop] epoch=1/micro_step=1721/global_step=4050, RunningAvgSamplesPerSec=8.218642970746249, CurrSamplesPerSec=8.210447448209441, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4060, Loss: 0.07710961252450943 +[2024-01-21 21:35:30,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=4060, skipped=0, lr=[7.665657251319713e-06, 7.665657251319713e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:35:30,502] [INFO] [timer.py:260:stop] epoch=1/micro_step=1731/global_step=4060, RunningAvgSamplesPerSec=8.218629337794937, CurrSamplesPerSec=8.18760236201259, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4070, Loss: 0.04236265644431114 +[2024-01-21 21:36:09,487] [INFO] [logging.py:96:log_dist] [Rank 0] step=4070, skipped=0, lr=[7.621325679901186e-06, 7.621325679901186e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:36:09,487] [INFO] [timer.py:260:stop] epoch=1/micro_step=1741/global_step=4070, RunningAvgSamplesPerSec=8.218609511672975, CurrSamplesPerSec=8.243312405946831, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4080, Loss: 0.19449593126773834 +[2024-01-21 21:36:48,414] [INFO] [logging.py:96:log_dist] [Rank 0] step=4080, skipped=0, lr=[7.577043604905184e-06, 7.577043604905184e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:36:48,415] [INFO] [timer.py:260:stop] epoch=1/micro_step=1751/global_step=4080, RunningAvgSamplesPerSec=8.218619290801936, CurrSamplesPerSec=8.226074646486396, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4090, Loss: 0.0350918173789978 +[2024-01-21 21:37:27,385] [INFO] [logging.py:96:log_dist] [Rank 0] step=4090, skipped=0, lr=[7.532811947771121e-06, 7.532811947771121e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:37:27,385] [INFO] [timer.py:260:stop] epoch=1/micro_step=1761/global_step=4090, RunningAvgSamplesPerSec=8.218607195044743, CurrSamplesPerSec=8.241693123694484, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4100, Loss: 0.06254701316356659 +[2024-01-21 21:38:06,356] [INFO] [logging.py:96:log_dist] [Rank 0] step=4100, skipped=0, lr=[7.4886316288893165e-06, 7.4886316288893165e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:38:06,357] [INFO] [timer.py:260:stop] epoch=1/micro_step=1771/global_step=4100, RunningAvgSamplesPerSec=8.21859428534018, CurrSamplesPerSec=8.25112618836333, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4110, Loss: 0.019799401983618736 +[2024-01-21 21:38:45,259] [INFO] [logging.py:96:log_dist] [Rank 0] step=4110, skipped=0, lr=[7.4445035675818e-06, 7.4445035675818e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:38:45,260] [INFO] [timer.py:260:stop] epoch=1/micro_step=1781/global_step=4110, RunningAvgSamplesPerSec=8.21861675414249, CurrSamplesPerSec=8.225328041219752, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4120, Loss: 0.019992681220173836 +[2024-01-21 21:39:24,146] [INFO] [logging.py:96:log_dist] [Rank 0] step=4120, skipped=0, lr=[7.4004286820832235e-06, 7.4004286820832235e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:39:24,147] [INFO] [timer.py:260:stop] epoch=1/micro_step=1791/global_step=4120, RunningAvgSamplesPerSec=8.218647166063938, CurrSamplesPerSec=8.249794381673222, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4130, Loss: 0.017380189150571823 +[2024-01-21 21:40:03,144] [INFO] [logging.py:96:log_dist] [Rank 0] step=4130, skipped=0, lr=[7.356407889521725e-06, 7.356407889521725e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:40:03,145] [INFO] [timer.py:260:stop] epoch=1/micro_step=1801/global_step=4130, RunningAvgSamplesPerSec=8.218620728350647, CurrSamplesPerSec=8.206248238748026, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4140, Loss: 0.134999617934227 +[2024-01-21 21:40:42,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=4140, skipped=0, lr=[7.312442105899855e-06, 7.312442105899855e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:40:42,152] [INFO] [timer.py:260:stop] epoch=1/micro_step=1811/global_step=4140, RunningAvgSamplesPerSec=8.218589497717248, CurrSamplesPerSec=8.18616316173978, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4150, Loss: 0.08562242239713669 +[2024-01-21 21:41:21,075] [INFO] [logging.py:96:log_dist] [Rank 0] step=4150, skipped=0, lr=[7.26853224607552e-06, 7.26853224607552e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:41:21,075] [INFO] [timer.py:260:stop] epoch=1/micro_step=1821/global_step=4150, RunningAvgSamplesPerSec=8.218601407791791, CurrSamplesPerSec=8.226243042096804, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4160, Loss: 0.08173200488090515 +[2024-01-21 21:42:00,018] [INFO] [logging.py:96:log_dist] [Rank 0] step=4160, skipped=0, lr=[7.224679223742938e-06, 7.224679223742938e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:42:00,019] [INFO] [timer.py:260:stop] epoch=1/micro_step=1831/global_step=4160, RunningAvgSamplesPerSec=8.218603114696448, CurrSamplesPerSec=8.175022259070609, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4170, Loss: 0.10926807671785355 +[2024-01-21 21:42:38,925] [INFO] [logging.py:96:log_dist] [Rank 0] step=4170, skipped=0, lr=[7.180883951413628e-06, 7.180883951413628e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:42:38,926] [INFO] [timer.py:260:stop] epoch=1/micro_step=1841/global_step=4170, RunningAvgSamplesPerSec=8.218623000828954, CurrSamplesPerSec=8.213192683954276, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4180, Loss: 0.07666481286287308 +[2024-01-21 21:43:17,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=4180, skipped=0, lr=[7.137147340397428e-06, 7.137147340397428e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:43:17,858] [INFO] [timer.py:260:stop] epoch=1/micro_step=1851/global_step=4180, RunningAvgSamplesPerSec=8.21863020815357, CurrSamplesPerSec=8.206805208835299, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4190, Loss: 0.045099444687366486 +[2024-01-21 21:43:56,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=4190, skipped=0, lr=[7.093470300783525e-06, 7.093470300783525e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:43:56,756] [INFO] [timer.py:260:stop] epoch=1/micro_step=1861/global_step=4190, RunningAvgSamplesPerSec=8.218654441853273, CurrSamplesPerSec=8.248825972822663, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4200, Loss: 0.14392530918121338 +[2024-01-21 21:44:35,701] [INFO] [logging.py:96:log_dist] [Rank 0] step=4200, skipped=0, lr=[7.04985374142152e-06, 7.04985374142152e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:44:35,702] [INFO] [timer.py:260:stop] epoch=1/micro_step=1871/global_step=4200, RunningAvgSamplesPerSec=8.218654956507846, CurrSamplesPerSec=8.216645422730055, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4210, Loss: 0.07495556771755219 +[2024-01-21 21:45:14,655] [INFO] [logging.py:96:log_dist] [Rank 0] step=4210, skipped=0, lr=[7.006298569902516e-06, 7.006298569902516e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:45:14,655] [INFO] [timer.py:260:stop] epoch=1/micro_step=1881/global_step=4210, RunningAvgSamplesPerSec=8.21865137913999, CurrSamplesPerSec=8.21838017772992, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4220, Loss: 0.038911838084459305 +[2024-01-21 21:45:53,571] [INFO] [logging.py:96:log_dist] [Rank 0] step=4220, skipped=0, lr=[6.962805692540233e-06, 6.962805692540233e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:45:53,571] [INFO] [timer.py:260:stop] epoch=1/micro_step=1891/global_step=4220, RunningAvgSamplesPerSec=8.21866668702607, CurrSamplesPerSec=8.235593251018958, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4230, Loss: 0.06450259685516357 +[2024-01-21 21:46:32,510] [INFO] [logging.py:96:log_dist] [Rank 0] step=4230, skipped=0, lr=[6.919376014352147e-06, 6.919376014352147e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:46:32,511] [INFO] [timer.py:260:stop] epoch=1/micro_step=1901/global_step=4230, RunningAvgSamplesPerSec=8.218670106025623, CurrSamplesPerSec=8.214348301730324, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4240, Loss: 0.05204075202345848 +[2024-01-21 21:47:11,495] [INFO] [logging.py:96:log_dist] [Rank 0] step=4240, skipped=0, lr=[6.8760104390406705e-06, 6.8760104390406705e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:47:11,496] [INFO] [timer.py:260:stop] epoch=1/micro_step=1911/global_step=4240, RunningAvgSamplesPerSec=8.218650814814056, CurrSamplesPerSec=8.199998802542861, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4250, Loss: 0.21581096947193146 +[2024-01-21 21:47:50,414] [INFO] [logging.py:96:log_dist] [Rank 0] step=4250, skipped=0, lr=[6.832709868974318e-06, 6.832709868974318e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:47:50,415] [INFO] [timer.py:260:stop] epoch=1/micro_step=1921/global_step=4250, RunningAvgSamplesPerSec=8.21866426647411, CurrSamplesPerSec=8.225292756006306, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4260, Loss: 0.1295495182275772 +[2024-01-21 21:48:29,348] [INFO] [logging.py:96:log_dist] [Rank 0] step=4260, skipped=0, lr=[6.789475205168968e-06, 6.789475205168968e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:48:29,349] [INFO] [timer.py:260:stop] epoch=1/micro_step=1931/global_step=4260, RunningAvgSamplesPerSec=8.218670289625999, CurrSamplesPerSec=8.221327619264866, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4270, Loss: 0.08829659968614578 +[2024-01-21 21:49:08,268] [INFO] [logging.py:96:log_dist] [Rank 0] step=4270, skipped=0, lr=[6.746307347269078e-06, 6.746307347269078e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:49:08,269] [INFO] [timer.py:260:stop] epoch=1/micro_step=1941/global_step=4270, RunningAvgSamplesPerSec=8.218683405933202, CurrSamplesPerSec=8.226436150938474, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4280, Loss: 0.06158251315355301 +[2024-01-21 21:49:47,214] [INFO] [logging.py:96:log_dist] [Rank 0] step=4280, skipped=0, lr=[6.703207193529e-06, 6.703207193529e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:49:47,215] [INFO] [timer.py:260:stop] epoch=1/micro_step=1951/global_step=4280, RunningAvgSamplesPerSec=8.218683319059718, CurrSamplesPerSec=8.236374068544336, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4290, Loss: 0.03813033550977707 +[2024-01-21 21:50:26,181] [INFO] [logging.py:96:log_dist] [Rank 0] step=4290, skipped=0, lr=[6.660175640794247e-06, 6.660175640794247e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:50:26,181] [INFO] [timer.py:260:stop] epoch=1/micro_step=1961/global_step=4290, RunningAvgSamplesPerSec=8.218673265822318, CurrSamplesPerSec=8.20136669761021, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4300, Loss: 0.147399440407753 +[2024-01-21 21:51:05,090] [INFO] [logging.py:96:log_dist] [Rank 0] step=4300, skipped=0, lr=[6.617213584482877e-06, 6.617213584482877e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:51:05,090] [INFO] [timer.py:260:stop] epoch=1/micro_step=1971/global_step=4300, RunningAvgSamplesPerSec=8.218691497085457, CurrSamplesPerSec=8.216329039735946, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4310, Loss: 0.021013891324400902 +[2024-01-21 21:51:44,008] [INFO] [logging.py:96:log_dist] [Rank 0] step=4310, skipped=0, lr=[6.574321918566819e-06, 6.574321918566819e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:51:44,009] [INFO] [timer.py:260:stop] epoch=1/micro_step=1981/global_step=4310, RunningAvgSamplesPerSec=8.218705176432374, CurrSamplesPerSec=8.218485856427321, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4320, Loss: 0.04652582108974457 +[2024-01-21 21:52:22,955] [INFO] [logging.py:96:log_dist] [Rank 0] step=4320, skipped=0, lr=[6.531501535553303e-06, 6.531501535553303e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:52:22,955] [INFO] [timer.py:260:stop] epoch=1/micro_step=1991/global_step=4320, RunningAvgSamplesPerSec=8.21870483131959, CurrSamplesPerSec=8.196817831770016, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4330, Loss: 0.037658847868442535 +[2024-01-21 21:53:01,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=4330, skipped=0, lr=[6.488753326466276e-06, 6.488753326466276e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:53:01,871] [INFO] [timer.py:260:stop] epoch=1/micro_step=2001/global_step=4330, RunningAvgSamplesPerSec=8.218719310777418, CurrSamplesPerSec=8.245629303233757, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4340, Loss: 0.04884570091962814 +[2024-01-21 21:53:40,806] [INFO] [logging.py:96:log_dist] [Rank 0] step=4340, skipped=0, lr=[6.446078180827847e-06, 6.446078180827847e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:53:40,807] [INFO] [timer.py:260:stop] epoch=1/micro_step=2011/global_step=4340, RunningAvgSamplesPerSec=8.21872448221297, CurrSamplesPerSec=8.203204804707308, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4350, Loss: 0.0841117724776268 +[2024-01-21 21:54:19,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=4350, skipped=0, lr=[6.40347698663981e-06, 6.40347698663981e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:54:19,749] [INFO] [timer.py:260:stop] epoch=1/micro_step=2021/global_step=4350, RunningAvgSamplesPerSec=8.218726539306244, CurrSamplesPerSec=8.221696764699594, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4360, Loss: 0.014991851523518562 +[2024-01-21 21:54:58,742] [INFO] [logging.py:96:log_dist] [Rank 0] step=4360, skipped=0, lr=[6.360950630365126e-06, 6.360950630365126e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:54:58,743] [INFO] [timer.py:260:stop] epoch=1/micro_step=2031/global_step=4360, RunningAvgSamplesPerSec=8.21870317546448, CurrSamplesPerSec=8.21548211838403, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4370, Loss: 0.14284050464630127 +[2024-01-21 21:55:37,726] [INFO] [logging.py:96:log_dist] [Rank 0] step=4370, skipped=0, lr=[6.318499996909519e-06, 6.318499996909519e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:55:37,726] [INFO] [timer.py:260:stop] epoch=1/micro_step=2041/global_step=4370, RunningAvgSamplesPerSec=8.218685006314265, CurrSamplesPerSec=8.213458562852962, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4380, Loss: 0.02137073315680027 +[2024-01-21 21:56:16,662] [INFO] [logging.py:96:log_dist] [Rank 0] step=4380, skipped=0, lr=[6.276125969603024e-06, 6.276125969603024e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:56:16,663] [INFO] [timer.py:260:stop] epoch=1/micro_step=2051/global_step=4380, RunningAvgSamplesPerSec=8.218689713252438, CurrSamplesPerSec=8.199991287896353, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4390, Loss: 0.040297091007232666 +[2024-01-21 21:56:55,587] [INFO] [logging.py:96:log_dist] [Rank 0] step=4390, skipped=0, lr=[6.23382943018164e-06, 6.23382943018164e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:56:55,588] [INFO] [timer.py:260:stop] epoch=1/micro_step=2061/global_step=4390, RunningAvgSamplesPerSec=8.218700020667272, CurrSamplesPerSec=8.227592472636907, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4400, Loss: 0.06972300261259079 +[2024-01-21 21:57:34,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=4400, skipped=0, lr=[6.191611258768953e-06, 6.191611258768953e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:57:34,540] [INFO] [timer.py:260:stop] epoch=1/micro_step=2071/global_step=4400, RunningAvgSamplesPerSec=8.218696760936295, CurrSamplesPerSec=8.21193439053836, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4410, Loss: 0.054737720638513565 +[2024-01-21 21:58:13,458] [INFO] [logging.py:96:log_dist] [Rank 0] step=4410, skipped=0, lr=[6.149472333857841e-06, 6.149472333857841e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:58:13,458] [INFO] [timer.py:260:stop] epoch=1/micro_step=2081/global_step=4410, RunningAvgSamplesPerSec=8.218710216329432, CurrSamplesPerSec=8.254704821654581, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4420, Loss: 0.15915793180465698 +[2024-01-21 21:58:52,468] [INFO] [logging.py:96:log_dist] [Rank 0] step=4420, skipped=0, lr=[6.1074135322921964e-06, 6.1074135322921964e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:58:52,468] [INFO] [timer.py:260:stop] epoch=1/micro_step=2091/global_step=4420, RunningAvgSamplesPerSec=8.218679773734221, CurrSamplesPerSec=8.202699456181522, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4430, Loss: 0.11707805842161179 +[2024-01-21 21:59:31,403] [INFO] [logging.py:96:log_dist] [Rank 0] step=4430, skipped=0, lr=[6.0654357292486566e-06, 6.0654357292486566e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 21:59:31,403] [INFO] [timer.py:260:stop] epoch=1/micro_step=2101/global_step=4430, RunningAvgSamplesPerSec=8.218685211503363, CurrSamplesPerSec=8.224538732951043, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4440, Loss: 0.05405707284808159 +[2024-01-21 22:00:10,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=4440, skipped=0, lr=[6.023539798218424e-06, 6.023539798218424e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:00:10,299] [INFO] [timer.py:260:stop] epoch=1/micro_step=2111/global_step=4440, RunningAvgSamplesPerSec=8.218709376653146, CurrSamplesPerSec=8.237994800560061, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4450, Loss: 0.06787370890378952 +[2024-01-21 22:00:49,274] [INFO] [logging.py:96:log_dist] [Rank 0] step=4450, skipped=0, lr=[5.981726610989061e-06, 5.981726610989061e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:00:49,275] [INFO] [timer.py:260:stop] epoch=1/micro_step=2121/global_step=4450, RunningAvgSamplesPerSec=8.218695160493672, CurrSamplesPerSec=8.210970328359338, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4460, Loss: 0.04252857714891434 +[2024-01-21 22:01:28,177] [INFO] [logging.py:96:log_dist] [Rank 0] step=4460, skipped=0, lr=[5.939997037626379e-06, 5.939997037626379e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:01:28,177] [INFO] [timer.py:260:stop] epoch=1/micro_step=2131/global_step=4460, RunningAvgSamplesPerSec=8.218715731033397, CurrSamplesPerSec=8.22758742909777, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4470, Loss: 0.16866199672222137 +[2024-01-21 22:02:07,098] [INFO] [logging.py:96:log_dist] [Rank 0] step=4470, skipped=0, lr=[5.898351946456301e-06, 5.898351946456301e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:02:07,099] [INFO] [timer.py:260:stop] epoch=1/micro_step=2141/global_step=4470, RunningAvgSamplesPerSec=8.218727540604727, CurrSamplesPerSec=8.225810470858093, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4480, Loss: 0.02378750964999199 +[2024-01-21 22:02:46,119] [INFO] [logging.py:96:log_dist] [Rank 0] step=4480, skipped=0, lr=[5.856792204046826e-06, 5.856792204046826e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:02:46,119] [INFO] [timer.py:260:stop] epoch=1/micro_step=2151/global_step=4480, RunningAvgSamplesPerSec=8.218692322625918, CurrSamplesPerSec=8.181757322726657, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4490, Loss: 0.04645959287881851 +[2024-01-21 22:03:25,043] [INFO] [logging.py:96:log_dist] [Rank 0] step=4490, skipped=0, lr=[5.815318675189969e-06, 5.815318675189969e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:03:25,043] [INFO] [timer.py:260:stop] epoch=1/micro_step=2161/global_step=4490, RunningAvgSamplesPerSec=8.218702811150667, CurrSamplesPerSec=8.25901425870154, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4500, Loss: 0.07580987364053726 +[2024-01-21 22:04:03,950] [INFO] [logging.py:96:log_dist] [Rank 0] step=4500, skipped=0, lr=[5.7739322228837816e-06, 5.7739322228837816e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:04:03,951] [INFO] [timer.py:260:stop] epoch=1/micro_step=2171/global_step=4500, RunningAvgSamplesPerSec=8.21872112541879, CurrSamplesPerSec=8.226783567999636, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4510, Loss: 0.0184792410582304 +[2024-01-21 22:04:42,860] [INFO] [logging.py:96:log_dist] [Rank 0] step=4510, skipped=0, lr=[5.732633708314403e-06, 5.732633708314403e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:04:42,861] [INFO] [timer.py:260:stop] epoch=1/micro_step=2181/global_step=4510, RunningAvgSamplesPerSec=8.218737981781597, CurrSamplesPerSec=8.228463584600608, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4520, Loss: 0.0374906025826931 +[2024-01-21 22:05:21,809] [INFO] [logging.py:96:log_dist] [Rank 0] step=4520, skipped=0, lr=[5.691423990838103e-06, 5.691423990838103e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:05:21,809] [INFO] [timer.py:260:stop] epoch=1/micro_step=2191/global_step=4520, RunningAvgSamplesPerSec=8.21873694562359, CurrSamplesPerSec=8.220234981923996, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4530, Loss: 0.10021474957466125 +[2024-01-21 22:06:00,741] [INFO] [logging.py:96:log_dist] [Rank 0] step=4530, skipped=0, lr=[5.650303927963459e-06, 5.650303927963459e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:06:00,741] [INFO] [timer.py:260:stop] epoch=1/micro_step=2201/global_step=4530, RunningAvgSamplesPerSec=8.218743505542093, CurrSamplesPerSec=8.23051422513505, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4540, Loss: 0.07934443652629852 +[2024-01-21 22:06:39,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=4540, skipped=0, lr=[5.60927437533344e-06, 5.60927437533344e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:06:39,678] [INFO] [timer.py:260:stop] epoch=1/micro_step=2211/global_step=4540, RunningAvgSamplesPerSec=8.218747429378912, CurrSamplesPerSec=8.219394805428468, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4550, Loss: 0.03781230375170708 +[2024-01-21 22:07:18,618] [INFO] [logging.py:96:log_dist] [Rank 0] step=4550, skipped=0, lr=[5.568336186707679e-06, 5.568336186707679e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:07:18,619] [INFO] [timer.py:260:stop] epoch=1/micro_step=2221/global_step=4550, RunningAvgSamplesPerSec=8.218750064508326, CurrSamplesPerSec=8.214510687307051, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4560, Loss: 0.037532299757003784 +[2024-01-21 22:07:57,561] [INFO] [logging.py:96:log_dist] [Rank 0] step=4560, skipped=0, lr=[5.527490213944637e-06, 5.527490213944637e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:07:57,562] [INFO] [timer.py:260:stop] epoch=1/micro_step=2231/global_step=4560, RunningAvgSamplesPerSec=8.218751446461578, CurrSamplesPerSec=8.187617845404885, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4570, Loss: 0.02890683524310589 +[2024-01-21 22:08:36,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=4570, skipped=0, lr=[5.486737306983942e-06, 5.486737306983942e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:08:36,505] [INFO] [timer.py:260:stop] epoch=1/micro_step=2241/global_step=4570, RunningAvgSamplesPerSec=8.218752770252799, CurrSamplesPerSec=8.235990463958219, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4580, Loss: 0.06574537605047226 +[2024-01-21 22:09:15,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=4580, skipped=0, lr=[5.446078313828635e-06, 5.446078313828635e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:09:15,441] [INFO] [timer.py:260:stop] epoch=1/micro_step=2251/global_step=4580, RunningAvgSamplesPerSec=8.218756928149707, CurrSamplesPerSec=8.230180623697839, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4590, Loss: 0.06829094886779785 +[2024-01-21 22:09:54,370] [INFO] [logging.py:96:log_dist] [Rank 0] step=4590, skipped=0, lr=[5.405514080527594e-06, 5.405514080527594e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:09:54,370] [INFO] [timer.py:260:stop] epoch=1/micro_step=2261/global_step=4590, RunningAvgSamplesPerSec=8.218764805501326, CurrSamplesPerSec=8.205318618367729, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4600, Loss: 0.15948641300201416 +[2024-01-21 22:10:33,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=4600, skipped=0, lr=[5.365045451157874e-06, 5.365045451157874e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:10:33,336] [INFO] [timer.py:260:stop] epoch=1/micro_step=2271/global_step=4600, RunningAvgSamplesPerSec=8.21875547593687, CurrSamplesPerSec=8.242159759979724, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4610, Loss: 0.12652692198753357 +[2024-01-21 22:11:12,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=4610, skipped=0, lr=[5.324673267807173e-06, 5.324673267807173e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:11:12,258] [INFO] [timer.py:260:stop] epoch=1/micro_step=2281/global_step=4610, RunningAvgSamplesPerSec=8.21876667652389, CurrSamplesPerSec=8.218550774688982, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4620, Loss: 0.11215459555387497 +[2024-01-21 22:11:51,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=4620, skipped=0, lr=[5.284398370556299e-06, 5.284398370556299e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:11:51,192] [INFO] [timer.py:260:stop] epoch=1/micro_step=2291/global_step=4620, RunningAvgSamplesPerSec=8.218772084903812, CurrSamplesPerSec=8.189807586909412, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4630, Loss: 0.0334763303399086 +[2024-01-21 22:12:30,177] [INFO] [logging.py:96:log_dist] [Rank 0] step=4630, skipped=0, lr=[5.2442215974616906e-06, 5.2442215974616906e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:12:30,178] [INFO] [timer.py:260:stop] epoch=1/micro_step=2301/global_step=4630, RunningAvgSamplesPerSec=8.21875368517606, CurrSamplesPerSec=8.16968495233235, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4640, Loss: 0.03742726147174835 +[2024-01-21 22:13:09,122] [INFO] [logging.py:96:log_dist] [Rank 0] step=4640, skipped=0, lr=[5.2041437845379806e-06, 5.2041437845379806e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:13:09,123] [INFO] [timer.py:260:stop] epoch=1/micro_step=2311/global_step=4640, RunningAvgSamplesPerSec=8.218754044461367, CurrSamplesPerSec=8.224958065608112, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 1, Total Step: 4650, Loss: 0.018095742911100388 +[2024-01-21 22:13:48,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=4650, skipped=0, lr=[5.164165765740597e-06, 5.164165765740597e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:13:48,103] [INFO] [timer.py:260:stop] epoch=1/micro_step=2321/global_step=4650, RunningAvgSamplesPerSec=8.218738598019604, CurrSamplesPerSec=8.209978369380899, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +***** Evaluating perplexity, Epoch 2/3 ***** +ppl: 1.0082670450210571 +eval loss: 0.008233007043600082 +Beginning of Epoch 3/3, Total Micro Batches 2329 +Epoch: 2, Total Step: 4659, Loss: 0.046429116278886795 +[2024-01-21 22:14:24,712] [INFO] [logging.py:96:log_dist] [Rank 0] step=4660, skipped=0, lr=[5.1242883729484134e-06, 5.1242883729484134e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:14:24,713] [INFO] [timer.py:260:stop] epoch=2/micro_step=2/global_step=4660, RunningAvgSamplesPerSec=8.219965823883438, CurrSamplesPerSec=8.218567885099267, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4669, Loss: 0.018754543736577034 +[2024-01-21 22:15:03,598] [INFO] [logging.py:96:log_dist] [Rank 0] step=4670, skipped=0, lr=[5.084512435946433e-06, 5.084512435946433e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:15:03,598] [INFO] [timer.py:260:stop] epoch=2/micro_step=12/global_step=4670, RunningAvgSamplesPerSec=8.219990846267988, CurrSamplesPerSec=8.233055732388157, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4679, Loss: 0.08063953369855881 +[2024-01-21 22:15:42,498] [INFO] [logging.py:96:log_dist] [Rank 0] step=4680, skipped=0, lr=[5.044838782408528e-06, 5.044838782408528e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:15:42,499] [INFO] [timer.py:260:stop] epoch=2/micro_step=22/global_step=4680, RunningAvgSamplesPerSec=8.220008609053188, CurrSamplesPerSec=8.241431992390392, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4689, Loss: 0.028717122972011566 +[2024-01-21 22:16:21,444] [INFO] [logging.py:96:log_dist] [Rank 0] step=4690, skipped=0, lr=[5.005268237880213e-06, 5.005268237880213e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:16:21,445] [INFO] [timer.py:260:stop] epoch=2/micro_step=32/global_step=4690, RunningAvgSamplesPerSec=8.220005714096283, CurrSamplesPerSec=8.203707708351848, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4699, Loss: 0.061068590730428696 +[2024-01-21 22:17:00,416] [INFO] [logging.py:96:log_dist] [Rank 0] step=4700, skipped=0, lr=[4.965801625761472e-06, 4.965801625761472e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:17:00,416] [INFO] [timer.py:260:stop] epoch=2/micro_step=42/global_step=4700, RunningAvgSamplesPerSec=8.219991578804533, CurrSamplesPerSec=8.194231107481109, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4709, Loss: 0.029893064871430397 +[2024-01-21 22:17:39,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=4710, skipped=0, lr=[4.9264397672896166e-06, 4.9264397672896166e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:17:39,410] [INFO] [timer.py:260:stop] epoch=2/micro_step=52/global_step=4710, RunningAvgSamplesPerSec=8.219967569502616, CurrSamplesPerSec=8.183401534581431, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4719, Loss: 0.10614433884620667 +[2024-01-21 22:18:18,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=4720, skipped=0, lr=[4.887183481522206e-06, 4.887183481522206e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:18:18,447] [INFO] [timer.py:260:stop] epoch=2/micro_step=62/global_step=4720, RunningAvgSamplesPerSec=8.219924206668532, CurrSamplesPerSec=8.176374361716327, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4729, Loss: 0.09477872401475906 +[2024-01-21 22:18:57,464] [INFO] [logging.py:96:log_dist] [Rank 0] step=4730, skipped=0, lr=[4.8480335853199965e-06, 4.8480335853199965e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:18:57,464] [INFO] [timer.py:260:stop] epoch=2/micro_step=72/global_step=4730, RunningAvgSamplesPerSec=8.219889760941552, CurrSamplesPerSec=8.182258597643699, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4739, Loss: 0.01677810028195381 +[2024-01-21 22:19:36,426] [INFO] [logging.py:96:log_dist] [Rank 0] step=4740, skipped=0, lr=[4.808990893329948e-06, 4.808990893329948e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:19:36,427] [INFO] [timer.py:260:stop] epoch=2/micro_step=82/global_step=4740, RunningAvgSamplesPerSec=8.219879893490697, CurrSamplesPerSec=8.244091143281068, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4749, Loss: 0.04384884238243103 +[2024-01-21 22:20:15,368] [INFO] [logging.py:96:log_dist] [Rank 0] step=4750, skipped=0, lr=[4.770056217968273e-06, 4.770056217968273e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:20:15,368] [INFO] [timer.py:260:stop] epoch=2/micro_step=92/global_step=4750, RunningAvgSamplesPerSec=8.21987946338454, CurrSamplesPerSec=8.195432437584637, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4759, Loss: 0.01988648623228073 +[2024-01-21 22:20:54,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=4760, skipped=0, lr=[4.731230369403527e-06, 4.731230369403527e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:20:54,318] [INFO] [timer.py:260:stop] epoch=2/micro_step=102/global_step=4760, RunningAvgSamplesPerSec=8.219875779116995, CurrSamplesPerSec=8.225348708413929, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4769, Loss: 0.03036932274699211 +[2024-01-21 22:21:33,248] [INFO] [logging.py:96:log_dist] [Rank 0] step=4770, skipped=0, lr=[4.692514155539758e-06, 4.692514155539758e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:21:33,249] [INFO] [timer.py:260:stop] epoch=2/micro_step=112/global_step=4770, RunningAvgSamplesPerSec=8.219879907755912, CurrSamplesPerSec=8.211830387529456, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4779, Loss: 0.07104656100273132 +[2024-01-21 22:22:12,207] [INFO] [logging.py:96:log_dist] [Rank 0] step=4780, skipped=0, lr=[4.653908381999685e-06, 4.653908381999685e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:22:12,208] [INFO] [timer.py:260:stop] epoch=2/micro_step=122/global_step=4780, RunningAvgSamplesPerSec=8.219871871138523, CurrSamplesPerSec=8.217341651711182, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4789, Loss: 0.013853972777724266 +[2024-01-21 22:22:51,132] [INFO] [logging.py:96:log_dist] [Rank 0] step=4790, skipped=0, lr=[4.61541385210794e-06, 4.61541385210794e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:22:51,132] [INFO] [timer.py:260:stop] epoch=2/micro_step=132/global_step=4790, RunningAvgSamplesPerSec=8.219878831248547, CurrSamplesPerSec=8.199639617844628, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4799, Loss: 0.01997406780719757 +[2024-01-21 22:23:30,139] [INFO] [logging.py:96:log_dist] [Rank 0] step=4800, skipped=0, lr=[4.577031366874365e-06, 4.577031366874365e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:23:30,139] [INFO] [timer.py:260:stop] epoch=2/micro_step=142/global_step=4800, RunningAvgSamplesPerSec=8.21984955467693, CurrSamplesPerSec=8.150428041347697, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4809, Loss: 0.08179482817649841 +[2024-01-21 22:24:09,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=4810, skipped=0, lr=[4.538761724977307e-06, 4.538761724977307e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:24:09,149] [INFO] [timer.py:260:stop] epoch=2/micro_step=152/global_step=4810, RunningAvgSamplesPerSec=8.219819349631551, CurrSamplesPerSec=8.19873603428696, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4819, Loss: 0.12035196274518967 +[2024-01-21 22:24:48,076] [INFO] [logging.py:96:log_dist] [Rank 0] step=4820, skipped=0, lr=[4.50060572274705e-06, 4.50060572274705e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:24:48,076] [INFO] [timer.py:260:stop] epoch=2/micro_step=162/global_step=4820, RunningAvgSamplesPerSec=8.219825019753976, CurrSamplesPerSec=8.235022766646354, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4829, Loss: 0.028960634022951126 +[2024-01-21 22:25:27,107] [INFO] [logging.py:96:log_dist] [Rank 0] step=4830, skipped=0, lr=[4.46256415414919e-06, 4.46256415414919e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:25:27,107] [INFO] [timer.py:260:stop] epoch=2/micro_step=172/global_step=4830, RunningAvgSamplesPerSec=8.2197855847502, CurrSamplesPerSec=8.216513132441023, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4839, Loss: 0.028713103383779526 +[2024-01-21 22:26:06,087] [INFO] [logging.py:96:log_dist] [Rank 0] step=4840, skipped=0, lr=[4.424637810768172e-06, 4.424637810768172e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:26:06,087] [INFO] [timer.py:260:stop] epoch=2/micro_step=182/global_step=4840, RunningAvgSamplesPerSec=8.21976866559073, CurrSamplesPerSec=8.225237812490942, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4849, Loss: 0.027361340820789337 +[2024-01-21 22:26:45,049] [INFO] [logging.py:96:log_dist] [Rank 0] step=4850, skipped=0, lr=[4.3868274817907545e-06, 4.3868274817907545e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:26:45,049] [INFO] [timer.py:260:stop] epoch=2/micro_step=192/global_step=4850, RunningAvgSamplesPerSec=8.21975955816052, CurrSamplesPerSec=8.213985848106866, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4859, Loss: 0.03962152823805809 +[2024-01-21 22:27:24,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=4860, skipped=0, lr=[4.349133953989654e-06, 4.349133953989654e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:27:24,031] [INFO] [timer.py:260:stop] epoch=2/micro_step=202/global_step=4860, RunningAvgSamplesPerSec=8.219741813891057, CurrSamplesPerSec=8.226032296579532, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4869, Loss: 0.0842229351401329 +[2024-01-21 22:28:02,915] [INFO] [logging.py:96:log_dist] [Rank 0] step=4870, skipped=0, lr=[4.311558011707109e-06, 4.311558011707109e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:28:02,915] [INFO] [timer.py:260:stop] epoch=2/micro_step=212/global_step=4870, RunningAvgSamplesPerSec=8.219766634603646, CurrSamplesPerSec=8.245115168799199, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4879, Loss: 0.02138122357428074 +[2024-01-21 22:28:41,880] [INFO] [logging.py:96:log_dist] [Rank 0] step=4880, skipped=0, lr=[4.274100436838618e-06, 4.274100436838618e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:28:41,881] [INFO] [timer.py:260:stop] epoch=2/micro_step=222/global_step=4880, RunningAvgSamplesPerSec=8.21975609037977, CurrSamplesPerSec=8.222864852459507, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4889, Loss: 0.03885550796985626 +[2024-01-21 22:29:20,846] [INFO] [logging.py:96:log_dist] [Rank 0] step=4890, skipped=0, lr=[4.236762008816629e-06, 4.236762008816629e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:29:20,846] [INFO] [timer.py:260:stop] epoch=2/micro_step=232/global_step=4890, RunningAvgSamplesPerSec=8.21974562753367, CurrSamplesPerSec=8.215320700100156, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4899, Loss: 0.05702659860253334 +[2024-01-21 22:29:59,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=4900, skipped=0, lr=[4.199543504594332e-06, 4.199543504594332e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:29:59,854] [INFO] [timer.py:260:stop] epoch=2/micro_step=242/global_step=4900, RunningAvgSamplesPerSec=8.219716907690383, CurrSamplesPerSec=8.174908732682537, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4909, Loss: 0.025160742923617363 +[2024-01-21 22:30:38,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=4910, skipped=0, lr=[4.1624456986295e-06, 4.1624456986295e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:30:38,838] [INFO] [timer.py:260:stop] epoch=2/micro_step=252/global_step=4910, RunningAvgSamplesPerSec=8.219698243572967, CurrSamplesPerSec=8.181569796853827, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4919, Loss: 0.07898124307394028 +[2024-01-21 22:31:17,704] [INFO] [logging.py:96:log_dist] [Rank 0] step=4920, skipped=0, lr=[4.125469362868365e-06, 4.125469362868365e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:31:17,705] [INFO] [timer.py:260:stop] epoch=2/micro_step=262/global_step=4920, RunningAvgSamplesPerSec=8.219730401791375, CurrSamplesPerSec=8.209219619792078, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4929, Loss: 0.15534308552742004 +[2024-01-21 22:31:56,672] [INFO] [logging.py:96:log_dist] [Rank 0] step=4930, skipped=0, lr=[4.0886152667295565e-06, 4.0886152667295565e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:31:56,672] [INFO] [timer.py:260:stop] epoch=2/micro_step=272/global_step=4930, RunningAvgSamplesPerSec=8.219719245695357, CurrSamplesPerSec=8.204597341963773, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4939, Loss: 0.086125947535038 +[2024-01-21 22:32:35,589] [INFO] [logging.py:96:log_dist] [Rank 0] step=4940, skipped=0, lr=[4.051884177088095e-06, 4.051884177088095e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:32:35,590] [INFO] [timer.py:260:stop] epoch=2/micro_step=282/global_step=4940, RunningAvgSamplesPerSec=8.219729450264927, CurrSamplesPerSec=8.21900925765634, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4949, Loss: 0.05816653370857239 +[2024-01-21 22:33:14,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=4950, skipped=0, lr=[4.015276858259427e-06, 4.015276858259427e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:33:14,535] [INFO] [timer.py:260:stop] epoch=2/micro_step=292/global_step=4950, RunningAvgSamplesPerSec=8.219727779036084, CurrSamplesPerSec=8.212631328910316, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4959, Loss: 0.05850698798894882 +[2024-01-21 22:33:53,455] [INFO] [logging.py:96:log_dist] [Rank 0] step=4960, skipped=0, lr=[3.9787940719835324e-06, 3.9787940719835324e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:33:53,455] [INFO] [timer.py:260:stop] epoch=2/micro_step=302/global_step=4960, RunningAvgSamplesPerSec=8.21973682901907, CurrSamplesPerSec=8.22799849782232, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4969, Loss: 0.03402804955840111 +[2024-01-21 22:34:32,403] [INFO] [logging.py:96:log_dist] [Rank 0] step=4970, skipped=0, lr=[3.942436577409058e-06, 3.942436577409058e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:34:32,404] [INFO] [timer.py:260:stop] epoch=2/micro_step=312/global_step=4970, RunningAvgSamplesPerSec=8.219733635233087, CurrSamplesPerSec=8.215177390117235, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4979, Loss: 0.024973131716251373 +[2024-01-21 22:35:11,384] [INFO] [logging.py:96:log_dist] [Rank 0] step=4980, skipped=0, lr=[3.906205131077546e-06, 3.906205131077546e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:35:11,385] [INFO] [timer.py:260:stop] epoch=2/micro_step=322/global_step=4980, RunningAvgSamplesPerSec=8.219716840100695, CurrSamplesPerSec=8.240580900181127, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4989, Loss: 0.07678724825382233 +[2024-01-21 22:35:50,343] [INFO] [logging.py:96:log_dist] [Rank 0] step=4990, skipped=0, lr=[3.870100486907651e-06, 3.870100486907651e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:35:50,344] [INFO] [timer.py:260:stop] epoch=2/micro_step=332/global_step=4990, RunningAvgSamplesPerSec=8.219709413848069, CurrSamplesPerSec=8.205107940382787, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 4999, Loss: 0.04152326658368111 +[2024-01-21 22:36:29,291] [INFO] [logging.py:96:log_dist] [Rank 0] step=5000, skipped=0, lr=[3.834123396179504e-06, 3.834123396179504e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:36:29,291] [INFO] [timer.py:260:stop] epoch=2/micro_step=342/global_step=5000, RunningAvgSamplesPerSec=8.219706764760257, CurrSamplesPerSec=8.173072333767083, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5009, Loss: 0.01830972544848919 +[2024-01-21 22:37:08,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=5010, skipped=0, lr=[3.79827460751903e-06, 3.79827460751903e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:37:08,298] [INFO] [timer.py:260:stop] epoch=2/micro_step=352/global_step=5010, RunningAvgSamplesPerSec=8.219679333485368, CurrSamplesPerSec=8.215017995062215, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5019, Loss: 0.010700651444494724 +[2024-01-21 22:37:47,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=5020, skipped=0, lr=[3.762554866882404e-06, 3.762554866882404e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:37:47,229] [INFO] [timer.py:260:stop] epoch=2/micro_step=362/global_step=5020, RunningAvgSamplesPerSec=8.219683676007266, CurrSamplesPerSec=8.226522372174001, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5029, Loss: 0.08092665672302246 +[2024-01-21 22:38:26,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=5030, skipped=0, lr=[3.7269649175405122e-06, 3.7269649175405122e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:38:26,109] [INFO] [timer.py:260:stop] epoch=2/micro_step=372/global_step=5030, RunningAvgSamplesPerSec=8.219709884775929, CurrSamplesPerSec=8.196570048058412, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5039, Loss: 0.026559187099337578 +[2024-01-21 22:39:05,042] [INFO] [logging.py:96:log_dist] [Rank 0] step=5040, skipped=0, lr=[3.691505500063496e-06, 3.691505500063496e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:39:05,042] [INFO] [timer.py:260:stop] epoch=2/micro_step=382/global_step=5040, RunningAvgSamplesPerSec=8.219713102204501, CurrSamplesPerSec=8.193640328386055, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5049, Loss: 0.02366100437939167 +[2024-01-21 22:39:44,037] [INFO] [logging.py:96:log_dist] [Rank 0] step=5050, skipped=0, lr=[3.6561773523053302e-06, 3.6561773523053302e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:39:44,038] [INFO] [timer.py:260:stop] epoch=2/micro_step=392/global_step=5050, RunningAvgSamplesPerSec=8.219690447550398, CurrSamplesPerSec=8.223373192636238, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5059, Loss: 0.050799280405044556 +[2024-01-21 22:40:22,978] [INFO] [logging.py:96:log_dist] [Rank 0] step=5060, skipped=0, lr=[3.6209812093884777e-06, 3.6209812093884777e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:40:22,978] [INFO] [timer.py:260:stop] epoch=2/micro_step=402/global_step=5060, RunningAvgSamplesPerSec=8.219690993105846, CurrSamplesPerSec=8.201336629077487, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5069, Loss: 0.041833654046058655 +[2024-01-21 22:41:01,942] [INFO] [logging.py:96:log_dist] [Rank 0] step=5070, skipped=0, lr=[3.585917803688603e-06, 3.585917803688603e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:41:01,943] [INFO] [timer.py:260:stop] epoch=2/micro_step=412/global_step=5070, RunningAvgSamplesPerSec=8.219681323088778, CurrSamplesPerSec=8.209353181685872, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5079, Loss: 0.04752175137400627 +[2024-01-21 22:41:40,903] [INFO] [logging.py:96:log_dist] [Rank 0] step=5080, skipped=0, lr=[3.5509878648192964e-06, 3.5509878648192964e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:41:40,904] [INFO] [timer.py:260:stop] epoch=2/micro_step=422/global_step=5080, RunningAvgSamplesPerSec=8.219673471560597, CurrSamplesPerSec=8.216121316616857, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5089, Loss: 0.06569486111402512 +[2024-01-21 22:42:19,872] [INFO] [logging.py:96:log_dist] [Rank 0] step=5090, skipped=0, lr=[3.5161921196169434e-06, 3.5161921196169434e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:42:19,872] [INFO] [timer.py:260:stop] epoch=2/micro_step=432/global_step=5090, RunningAvgSamplesPerSec=8.219662166113306, CurrSamplesPerSec=8.20536476832321, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5099, Loss: 0.026724472641944885 +[2024-01-21 22:42:58,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=5100, skipped=0, lr=[3.481531292125546e-06, 3.481531292125546e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:42:58,778] [INFO] [timer.py:260:stop] epoch=2/micro_step=442/global_step=5100, RunningAvgSamplesPerSec=8.21967702505973, CurrSamplesPerSec=8.192224006125167, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5109, Loss: 0.016669342294335365 +[2024-01-21 22:43:37,774] [INFO] [logging.py:96:log_dist] [Rank 0] step=5110, skipped=0, lr=[3.447006103581709e-06, 3.447006103581709e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:43:37,774] [INFO] [timer.py:260:stop] epoch=2/micro_step=452/global_step=5110, RunningAvgSamplesPerSec=8.219654543354038, CurrSamplesPerSec=8.207543938040372, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5119, Loss: 0.025021540001034737 +[2024-01-21 22:44:16,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=5120, skipped=0, lr=[3.412617272399584e-06, 3.412617272399584e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:44:16,744] [INFO] [timer.py:260:stop] epoch=2/micro_step=462/global_step=5120, RunningAvgSamplesPerSec=8.219642787472802, CurrSamplesPerSec=8.242388542406983, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5129, Loss: 0.014915797859430313 +[2024-01-21 22:44:55,694] [INFO] [logging.py:96:log_dist] [Rank 0] step=5130, skipped=0, lr=[3.3783655141559677e-06, 3.3783655141559677e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:44:55,695] [INFO] [timer.py:260:stop] epoch=2/micro_step=472/global_step=5130, RunningAvgSamplesPerSec=8.219639049476397, CurrSamplesPerSec=8.226267243230177, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5139, Loss: 0.06622394919395447 +[2024-01-21 22:45:34,653] [INFO] [logging.py:96:log_dist] [Rank 0] step=5140, skipped=0, lr=[3.3442515415753583e-06, 3.3442515415753583e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:45:34,653] [INFO] [timer.py:260:stop] epoch=2/micro_step=482/global_step=5140, RunningAvgSamplesPerSec=8.21963198656812, CurrSamplesPerSec=8.205019659491422, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5149, Loss: 0.040831539779901505 +[2024-01-21 22:46:13,670] [INFO] [logging.py:96:log_dist] [Rank 0] step=5150, skipped=0, lr=[3.3102760645151797e-06, 3.3102760645151797e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:46:13,671] [INFO] [timer.py:260:stop] epoch=2/micro_step=492/global_step=5150, RunningAvgSamplesPerSec=8.21960093620441, CurrSamplesPerSec=8.244446130244429, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5159, Loss: 0.019962387159466743 +[2024-01-21 22:46:52,588] [INFO] [logging.py:96:log_dist] [Rank 0] step=5160, skipped=0, lr=[3.2764397899509735e-06, 3.2764397899509735e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:46:52,589] [INFO] [timer.py:260:stop] epoch=2/micro_step=502/global_step=5160, RunningAvgSamplesPerSec=8.21961072791464, CurrSamplesPerSec=8.24248471580295, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5169, Loss: 0.02283751405775547 +[2024-01-21 22:47:31,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=5170, skipped=0, lr=[3.242743421961698e-06, 3.242743421961698e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:47:31,505] [INFO] [timer.py:260:stop] epoch=2/micro_step=512/global_step=5170, RunningAvgSamplesPerSec=8.219621095781324, CurrSamplesPerSec=8.236310384676939, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5179, Loss: 0.02541237324476242 +[2024-01-21 22:48:10,459] [INFO] [logging.py:96:log_dist] [Rank 0] step=5180, skipped=0, lr=[3.2091876617150806e-06, 3.2091876617150806e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:48:10,460] [INFO] [timer.py:260:stop] epoch=2/micro_step=522/global_step=5180, RunningAvgSamplesPerSec=8.219615888621993, CurrSamplesPerSec=8.253998694047956, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5189, Loss: 0.023689234629273415 +[2024-01-21 22:48:49,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=5190, skipped=0, lr=[3.1757732074530267e-06, 3.1757732074530267e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:48:49,446] [INFO] [timer.py:260:stop] epoch=2/micro_step=532/global_step=5190, RunningAvgSamplesPerSec=8.219597799299677, CurrSamplesPerSec=8.233844150727663, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5199, Loss: 0.034411169588565826 +[2024-01-21 22:49:28,367] [INFO] [logging.py:96:log_dist] [Rank 0] step=5200, skipped=0, lr=[3.142500754477088e-06, 3.142500754477088e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:49:28,368] [INFO] [timer.py:260:stop] epoch=2/micro_step=542/global_step=5200, RunningAvgSamplesPerSec=8.21960588402189, CurrSamplesPerSec=8.243445054316021, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5209, Loss: 0.0378330759704113 +[2024-01-21 22:50:07,348] [INFO] [logging.py:96:log_dist] [Rank 0] step=5210, skipped=0, lr=[3.1093709951339957e-06, 3.1093709951339957e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:50:07,349] [INFO] [timer.py:260:stop] epoch=2/micro_step=552/global_step=5210, RunningAvgSamplesPerSec=8.219590147344803, CurrSamplesPerSec=8.226230941583516, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5219, Loss: 0.023936551064252853 +[2024-01-21 22:50:46,296] [INFO] [logging.py:96:log_dist] [Rank 0] step=5220, skipped=0, lr=[3.0763846188012536e-06, 3.0763846188012536e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:50:46,297] [INFO] [timer.py:260:stop] epoch=2/micro_step=562/global_step=5220, RunningAvgSamplesPerSec=8.21958753364744, CurrSamplesPerSec=8.217684275903348, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5229, Loss: 0.026020409539341927 +[2024-01-21 22:51:25,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=5230, skipped=0, lr=[3.043542311872796e-06, 3.043542311872796e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:51:25,230] [INFO] [timer.py:260:stop] epoch=2/micro_step=572/global_step=5230, RunningAvgSamplesPerSec=8.219591131995516, CurrSamplesPerSec=8.186631520443381, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5239, Loss: 0.015758175402879715 +[2024-01-21 22:52:04,170] [INFO] [logging.py:96:log_dist] [Rank 0] step=5240, skipped=0, lr=[3.0108447577446954e-06, 3.0108447577446954e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:52:04,171] [INFO] [timer.py:260:stop] epoch=2/micro_step=582/global_step=5240, RunningAvgSamplesPerSec=8.219591470239232, CurrSamplesPerSec=8.232730509622128, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5249, Loss: 0.07202973961830139 +[2024-01-21 22:52:43,117] [INFO] [logging.py:96:log_dist] [Rank 0] step=5250, skipped=0, lr=[2.9782926368009644e-06, 2.9782926368009644e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:52:43,117] [INFO] [timer.py:260:stop] epoch=2/micro_step=592/global_step=5250, RunningAvgSamplesPerSec=8.21958966545786, CurrSamplesPerSec=8.221939018667829, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5259, Loss: 0.07467261701822281 +[2024-01-21 22:53:22,099] [INFO] [logging.py:96:log_dist] [Rank 0] step=5260, skipped=0, lr=[2.9458866263993604e-06, 2.9458866263993604e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:53:22,099] [INFO] [timer.py:260:stop] epoch=2/micro_step=602/global_step=5260, RunningAvgSamplesPerSec=8.219573489069068, CurrSamplesPerSec=8.202014728066098, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5269, Loss: 0.023700254037976265 +[2024-01-21 22:54:01,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=5270, skipped=0, lr=[2.9136274008573373e-06, 2.9136274008573373e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:54:01,013] [INFO] [timer.py:260:stop] epoch=2/micro_step=612/global_step=5270, RunningAvgSamplesPerSec=8.219584918924403, CurrSamplesPerSec=8.233790103138217, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5279, Loss: 0.03472544252872467 +[2024-01-21 22:54:39,913] [INFO] [logging.py:96:log_dist] [Rank 0] step=5280, skipped=0, lr=[2.8815156314379668e-06, 2.8815156314379668e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:54:39,913] [INFO] [timer.py:260:stop] epoch=2/micro_step=622/global_step=5280, RunningAvgSamplesPerSec=8.219601831558576, CurrSamplesPerSec=8.221612155450764, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5289, Loss: 0.01243289839476347 +[2024-01-21 22:55:18,831] [INFO] [logging.py:96:log_dist] [Rank 0] step=5290, skipped=0, lr=[2.8495519863360166e-06, 2.8495519863360166e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:55:18,831] [INFO] [timer.py:260:stop] epoch=2/micro_step=632/global_step=5290, RunningAvgSamplesPerSec=8.21961134354304, CurrSamplesPerSec=8.236148652297324, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5299, Loss: 0.03455457091331482 +[2024-01-21 22:55:57,764] [INFO] [logging.py:96:log_dist] [Rank 0] step=5300, skipped=0, lr=[2.817737130663999e-06, 2.817737130663999e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:55:57,765] [INFO] [timer.py:260:stop] epoch=2/micro_step=642/global_step=5300, RunningAvgSamplesPerSec=8.219614783108138, CurrSamplesPerSec=8.209592198027376, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5309, Loss: 0.061165884137153625 +[2024-01-21 22:56:36,743] [INFO] [logging.py:96:log_dist] [Rank 0] step=5310, skipped=0, lr=[2.7860717264383807e-06, 2.7860717264383807e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:56:36,743] [INFO] [timer.py:260:stop] epoch=2/micro_step=652/global_step=5310, RunningAvgSamplesPerSec=8.219600186903847, CurrSamplesPerSec=8.219198000414458, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5319, Loss: 0.02157147042453289 +[2024-01-21 22:57:15,656] [INFO] [logging.py:96:log_dist] [Rank 0] step=5320, skipped=0, lr=[2.754556432565758e-06, 2.754556432565758e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:57:15,657] [INFO] [timer.py:260:stop] epoch=2/micro_step=662/global_step=5320, RunningAvgSamplesPerSec=8.219611485195832, CurrSamplesPerSec=8.203347697278511, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5329, Loss: 0.033548034727573395 +[2024-01-21 22:57:54,566] [INFO] [logging.py:96:log_dist] [Rank 0] step=5330, skipped=0, lr=[2.723191904829192e-06, 2.723191904829192e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:57:54,566] [INFO] [timer.py:260:stop] epoch=2/micro_step=672/global_step=5330, RunningAvgSamplesPerSec=8.219624381431206, CurrSamplesPerSec=8.207117345848411, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5339, Loss: 0.013107041828334332 +[2024-01-21 22:58:33,526] [INFO] [logging.py:96:log_dist] [Rank 0] step=5340, skipped=0, lr=[2.691978795874518e-06, 2.691978795874518e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:58:33,526] [INFO] [timer.py:260:stop] epoch=2/micro_step=682/global_step=5340, RunningAvgSamplesPerSec=8.219617189506163, CurrSamplesPerSec=8.194187083747389, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5349, Loss: 0.01637548953294754 +[2024-01-21 22:59:12,492] [INFO] [logging.py:96:log_dist] [Rank 0] step=5350, skipped=0, lr=[2.66091775519681e-06, 2.66091775519681e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:59:12,493] [INFO] [timer.py:260:stop] epoch=2/micro_step=692/global_step=5350, RunningAvgSamplesPerSec=8.219607605593138, CurrSamplesPerSec=8.204252799178656, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5359, Loss: 0.08014064282178879 +[2024-01-21 22:59:51,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=5360, skipped=0, lr=[2.6300094291268297e-06, 2.6300094291268297e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 22:59:51,432] [INFO] [timer.py:260:stop] epoch=2/micro_step=702/global_step=5360, RunningAvgSamplesPerSec=8.219608675518424, CurrSamplesPerSec=8.20650865048645, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5369, Loss: 0.049721069633960724 +[2024-01-21 23:00:30,349] [INFO] [logging.py:96:log_dist] [Rank 0] step=5370, skipped=0, lr=[2.599254460817593e-06, 2.599254460817593e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:00:30,349] [INFO] [timer.py:260:stop] epoch=2/micro_step=712/global_step=5370, RunningAvgSamplesPerSec=8.219618426859807, CurrSamplesPerSec=8.229392402183873, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5379, Loss: 0.025823500007390976 +[2024-01-21 23:01:09,282] [INFO] [logging.py:96:log_dist] [Rank 0] step=5380, skipped=0, lr=[2.568653490230989e-06, 2.568653490230989e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:01:09,283] [INFO] [timer.py:260:stop] epoch=2/micro_step=722/global_step=5380, RunningAvgSamplesPerSec=8.21962169726351, CurrSamplesPerSec=8.22835310911887, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5389, Loss: 0.04489656165242195 +[2024-01-21 23:01:48,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=5390, skipped=0, lr=[2.538207154124456e-06, 2.538207154124456e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:01:48,258] [INFO] [timer.py:260:stop] epoch=2/micro_step=732/global_step=5390, RunningAvgSamplesPerSec=8.219608568501709, CurrSamplesPerSec=8.222918252932963, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5399, Loss: 0.05568582937121391 +[2024-01-21 23:02:27,180] [INFO] [logging.py:96:log_dist] [Rank 0] step=5400, skipped=0, lr=[2.507916086037736e-06, 2.507916086037736e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:02:27,181] [INFO] [timer.py:260:stop] epoch=2/micro_step=742/global_step=5400, RunningAvgSamplesPerSec=8.219616033195829, CurrSamplesPerSec=8.246947603837754, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5409, Loss: 0.07828336954116821 +[2024-01-21 23:03:06,167] [INFO] [logging.py:96:log_dist] [Rank 0] step=5410, skipped=0, lr=[2.477780916279693e-06, 2.477780916279693e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:03:06,168] [INFO] [timer.py:260:stop] epoch=2/micro_step=752/global_step=5410, RunningAvgSamplesPerSec=8.219598382282955, CurrSamplesPerSec=8.235778208002511, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5419, Loss: 0.0357724092900753 +[2024-01-21 23:03:45,103] [INFO] [logging.py:96:log_dist] [Rank 0] step=5420, skipped=0, lr=[2.4478022719151915e-06, 2.4478022719151915e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:03:45,103] [INFO] [timer.py:260:stop] epoch=2/micro_step=762/global_step=5420, RunningAvgSamplesPerSec=8.219600799341999, CurrSamplesPerSec=8.220145368339423, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5429, Loss: 0.035199038684368134 +[2024-01-21 23:04:24,091] [INFO] [logging.py:96:log_dist] [Rank 0] step=5430, skipped=0, lr=[2.417980776752057e-06, 2.417980776752057e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:04:24,092] [INFO] [timer.py:260:stop] epoch=2/micro_step=772/global_step=5430, RunningAvgSamplesPerSec=8.219582650401822, CurrSamplesPerSec=8.215956855640954, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5439, Loss: 0.032465286552906036 +[2024-01-21 23:05:03,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=5440, skipped=0, lr=[2.388317051328084e-06, 2.388317051328084e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:05:03,046] [INFO] [timer.py:260:stop] epoch=2/micro_step=782/global_step=5440, RunningAvgSamplesPerSec=8.219577744435936, CurrSamplesPerSec=8.226850634468747, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5449, Loss: 0.10486235469579697 +[2024-01-21 23:05:42,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=5450, skipped=0, lr=[2.3588117128981356e-06, 2.3588117128981356e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:05:42,054] [INFO] [timer.py:260:stop] epoch=2/micro_step=792/global_step=5450, RunningAvgSamplesPerSec=8.21955212702025, CurrSamplesPerSec=8.110095656880226, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5459, Loss: 0.041757963597774506 +[2024-01-21 23:06:20,975] [INFO] [logging.py:96:log_dist] [Rank 0] step=5460, skipped=0, lr=[2.3294653754212915e-06, 2.3294653754212915e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:06:20,976] [INFO] [timer.py:260:stop] epoch=2/micro_step=802/global_step=5460, RunningAvgSamplesPerSec=8.219560184580477, CurrSamplesPerSec=8.199535425103502, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5469, Loss: 0.08660506457090378 +[2024-01-21 23:06:59,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=5470, skipped=0, lr=[2.3002786495480754e-06, 2.3002786495480754e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:06:59,895] [INFO] [timer.py:260:stop] epoch=2/micro_step=812/global_step=5470, RunningAvgSamplesPerSec=8.219568980997472, CurrSamplesPerSec=8.233376435275346, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5479, Loss: 0.03184046968817711 +[2024-01-21 23:07:38,855] [INFO] [logging.py:96:log_dist] [Rank 0] step=5480, skipped=0, lr=[2.2712521426077483e-06, 2.2712521426077483e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:07:38,856] [INFO] [timer.py:260:stop] epoch=2/micro_step=822/global_step=5480, RunningAvgSamplesPerSec=8.2195618093675, CurrSamplesPerSec=8.233228454303275, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5489, Loss: 0.01756538450717926 +[2024-01-21 23:08:17,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=5490, skipped=0, lr=[2.24238645859567e-06, 2.24238645859567e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:08:17,798] [INFO] [timer.py:260:stop] epoch=2/micro_step=832/global_step=5490, RunningAvgSamplesPerSec=8.219561640252026, CurrSamplesPerSec=8.200233266432667, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5499, Loss: 0.023386148735880852 +[2024-01-21 23:08:56,800] [INFO] [logging.py:96:log_dist] [Rank 0] step=5500, skipped=0, lr=[2.2136821981607305e-06, 2.2136821981607305e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:08:56,801] [INFO] [timer.py:260:stop] epoch=2/micro_step=842/global_step=5500, RunningAvgSamplesPerSec=8.21953828786125, CurrSamplesPerSec=8.218347468207707, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5509, Loss: 0.020102746784687042 +[2024-01-21 23:09:35,771] [INFO] [logging.py:96:log_dist] [Rank 0] step=5510, skipped=0, lr=[2.1851399585928536e-06, 2.1851399585928536e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:09:35,771] [INFO] [timer.py:260:stop] epoch=2/micro_step=852/global_step=5510, RunningAvgSamplesPerSec=8.219527571718002, CurrSamplesPerSec=8.200588995798823, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5519, Loss: 0.026628950610756874 +[2024-01-21 23:10:14,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=5520, skipped=0, lr=[2.1567603338105667e-06, 2.1567603338105667e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:10:14,729] [INFO] [timer.py:260:stop] epoch=2/micro_step=862/global_step=5520, RunningAvgSamplesPerSec=8.219521428102688, CurrSamplesPerSec=8.193519781898022, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5529, Loss: 0.03135382756590843 +[2024-01-21 23:10:53,661] [INFO] [logging.py:96:log_dist] [Rank 0] step=5530, skipped=0, lr=[2.1285439143486408e-06, 2.1285439143486408e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:10:53,661] [INFO] [timer.py:260:stop] epoch=2/micro_step=872/global_step=5530, RunningAvgSamplesPerSec=8.21952534233696, CurrSamplesPerSec=8.196778285486126, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5539, Loss: 0.053377971053123474 +[2024-01-21 23:11:32,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=5540, skipped=0, lr=[2.100491287345813e-06, 2.100491287345813e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:11:32,608] [INFO] [timer.py:260:stop] epoch=2/micro_step=882/global_step=5540, RunningAvgSamplesPerSec=8.219523767404139, CurrSamplesPerSec=8.249144356508216, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5549, Loss: 0.0809902623295784 +[2024-01-21 23:12:11,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=5550, skipped=0, lr=[2.0726030365325434e-06, 2.0726030365325434e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:12:11,599] [INFO] [timer.py:260:stop] epoch=2/micro_step=892/global_step=5550, RunningAvgSamplesPerSec=8.21950497126161, CurrSamplesPerSec=8.227058396185601, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5559, Loss: 0.06669999659061432 +[2024-01-21 23:12:50,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=5560, skipped=0, lr=[2.044879742218906e-06, 2.044879742218906e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:12:50,637] [INFO] [timer.py:260:stop] epoch=2/micro_step=902/global_step=5560, RunningAvgSamplesPerSec=8.219468733291503, CurrSamplesPerSec=8.231170908481653, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5569, Loss: 0.054657239466905594 +[2024-01-21 23:13:29,568] [INFO] [logging.py:96:log_dist] [Rank 0] step=5570, skipped=0, lr=[2.017321981282471e-06, 2.017321981282471e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:13:29,569] [INFO] [timer.py:260:stop] epoch=2/micro_step=912/global_step=5570, RunningAvgSamplesPerSec=8.219472775829667, CurrSamplesPerSec=8.227928386263333, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5579, Loss: 0.05893351882696152 +[2024-01-21 23:14:08,499] [INFO] [logging.py:96:log_dist] [Rank 0] step=5580, skipped=0, lr=[1.9899303271563443e-06, 1.9899303271563443e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:14:08,500] [INFO] [timer.py:260:stop] epoch=2/micro_step=922/global_step=5580, RunningAvgSamplesPerSec=8.219476996720475, CurrSamplesPerSec=8.207881228199167, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5589, Loss: 0.028212716802954674 +[2024-01-21 23:14:47,468] [INFO] [logging.py:96:log_dist] [Rank 0] step=5590, skipped=0, lr=[1.9627053498171946e-06, 1.9627053498171946e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:14:47,468] [INFO] [timer.py:260:stop] epoch=2/micro_step=932/global_step=5590, RunningAvgSamplesPerSec=8.219467142855319, CurrSamplesPerSec=8.21335954733844, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5599, Loss: 0.015249170362949371 +[2024-01-21 23:15:26,436] [INFO] [logging.py:96:log_dist] [Rank 0] step=5600, skipped=0, lr=[1.9356476157734315e-06, 1.9356476157734315e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:15:26,436] [INFO] [timer.py:260:stop] epoch=2/micro_step=942/global_step=5600, RunningAvgSamplesPerSec=8.219457462421584, CurrSamplesPerSec=8.223441211203136, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5609, Loss: 0.02434239163994789 +[2024-01-21 23:16:05,408] [INFO] [logging.py:96:log_dist] [Rank 0] step=5610, skipped=0, lr=[1.9087576880533763e-06, 1.9087576880533763e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:16:05,409] [INFO] [timer.py:260:stop] epoch=2/micro_step=952/global_step=5610, RunningAvgSamplesPerSec=8.219446335098345, CurrSamplesPerSec=8.211006495443096, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5619, Loss: 0.05370941385626793 +[2024-01-21 23:16:44,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=5620, skipped=0, lr=[1.8820361261935882e-06, 1.8820361261935882e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:16:44,328] [INFO] [timer.py:260:stop] epoch=2/micro_step=962/global_step=5620, RunningAvgSamplesPerSec=8.219455000416904, CurrSamplesPerSec=8.232667387062815, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5629, Loss: 0.0338558629155159 +[2024-01-21 23:17:23,304] [INFO] [logging.py:96:log_dist] [Rank 0] step=5630, skipped=0, lr=[1.8554834862271887e-06, 1.8554834862271887e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:17:23,304] [INFO] [timer.py:260:stop] epoch=2/micro_step=972/global_step=5630, RunningAvgSamplesPerSec=8.21944262347945, CurrSamplesPerSec=8.213146445902952, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5639, Loss: 0.10875668376684189 +[2024-01-21 23:18:02,271] [INFO] [logging.py:96:log_dist] [Rank 0] step=5640, skipped=0, lr=[1.829100320672309e-06, 1.829100320672309e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:18:02,272] [INFO] [timer.py:260:stop] epoch=2/micro_step=982/global_step=5640, RunningAvgSamplesPerSec=8.219433391942854, CurrSamplesPerSec=8.228894416874681, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5649, Loss: 0.07654879242181778 +[2024-01-21 23:18:41,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=5650, skipped=0, lr=[1.802887178520586e-06, 1.802887178520586e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:18:41,293] [INFO] [timer.py:260:stop] epoch=2/micro_step=992/global_step=5650, RunningAvgSamplesPerSec=8.219404185763356, CurrSamplesPerSec=8.19167251309228, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5659, Loss: 0.10442313551902771 +[2024-01-21 23:19:20,271] [INFO] [logging.py:96:log_dist] [Rank 0] step=5660, skipped=0, lr=[1.7768446052257404e-06, 1.7768446052257404e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:19:20,271] [INFO] [timer.py:260:stop] epoch=2/micro_step=1002/global_step=5660, RunningAvgSamplesPerSec=8.219390847756747, CurrSamplesPerSec=8.223549035533294, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5669, Loss: 0.018104571849107742 +[2024-01-21 23:19:59,247] [INFO] [logging.py:96:log_dist] [Rank 0] step=5670, skipped=0, lr=[1.7509731426922284e-06, 1.7509731426922284e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:19:59,247] [INFO] [timer.py:260:stop] epoch=2/micro_step=1012/global_step=5670, RunningAvgSamplesPerSec=8.219378486381014, CurrSamplesPerSec=8.200694217441221, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5679, Loss: 0.01581454835832119 +[2024-01-21 23:20:38,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=5680, skipped=0, lr=[1.7252733292639623e-06, 1.7252733292639623e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:20:38,166] [INFO] [timer.py:260:stop] epoch=2/micro_step=1022/global_step=5680, RunningAvgSamplesPerSec=8.219387236751162, CurrSamplesPerSec=8.22542885778347, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5689, Loss: 0.08269765228033066 +[2024-01-21 23:21:17,132] [INFO] [logging.py:96:log_dist] [Rank 0] step=5690, skipped=0, lr=[1.6997456997131101e-06, 1.6997456997131101e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:21:17,133] [INFO] [timer.py:260:stop] epoch=2/micro_step=1032/global_step=5690, RunningAvgSamplesPerSec=8.21937855296478, CurrSamplesPerSec=8.190782181067316, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5699, Loss: 0.039170339703559875 +[2024-01-21 23:21:56,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=5700, skipped=0, lr=[1.6743907852289686e-06, 1.6743907852289686e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:21:56,031] [INFO] [timer.py:260:stop] epoch=2/micro_step=1042/global_step=5700, RunningAvgSamplesPerSec=8.219395183603284, CurrSamplesPerSec=8.247701686972977, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5709, Loss: 0.0378580316901207 +[2024-01-21 23:22:34,988] [INFO] [logging.py:96:log_dist] [Rank 0] step=5710, skipped=0, lr=[1.6492091134069078e-06, 1.6492091134069078e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:22:34,988] [INFO] [timer.py:260:stop] epoch=2/micro_step=1052/global_step=5710, RunningAvgSamplesPerSec=8.219389670612344, CurrSamplesPerSec=8.250568258133681, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5719, Loss: 0.02568187564611435 +[2024-01-21 23:23:13,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=5720, skipped=0, lr=[1.624201208237397e-06, 1.624201208237397e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:23:13,945] [INFO] [timer.py:260:stop] epoch=2/micro_step=1062/global_step=5720, RunningAvgSamplesPerSec=8.219384453498032, CurrSamplesPerSec=8.205458073079821, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5729, Loss: 0.027237065136432648 +[2024-01-21 23:23:52,967] [INFO] [logging.py:96:log_dist] [Rank 0] step=5730, skipped=0, lr=[1.5993675900950945e-06, 1.5993675900950945e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:23:52,968] [INFO] [timer.py:260:stop] epoch=2/micro_step=1072/global_step=5730, RunningAvgSamplesPerSec=8.219355130539872, CurrSamplesPerSec=8.204905800010502, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5739, Loss: 0.035896386951208115 +[2024-01-21 23:24:31,971] [INFO] [logging.py:96:log_dist] [Rank 0] step=5740, skipped=0, lr=[1.5747087757280243e-06, 1.5747087757280243e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:24:31,972] [INFO] [timer.py:260:stop] epoch=2/micro_step=1082/global_step=5740, RunningAvgSamplesPerSec=8.219332572386874, CurrSamplesPerSec=8.204875705521525, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5749, Loss: 0.016952909529209137 +[2024-01-21 23:25:10,911] [INFO] [logging.py:96:log_dist] [Rank 0] step=5750, skipped=0, lr=[1.5502252782468252e-06, 1.5502252782468252e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:25:10,911] [INFO] [timer.py:260:stop] epoch=2/micro_step=1092/global_step=5750, RunningAvgSamplesPerSec=8.219333816239804, CurrSamplesPerSec=8.224498414756551, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5759, Loss: 0.04264436289668083 +[2024-01-21 23:25:49,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=5760, skipped=0, lr=[1.525917607114068e-06, 1.525917607114068e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:25:49,859] [INFO] [timer.py:260:stop] epoch=2/micro_step=1102/global_step=5760, RunningAvgSamplesPerSec=8.219331891483, CurrSamplesPerSec=8.212668515680361, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5769, Loss: 0.024373987689614296 +[2024-01-21 23:26:28,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=5770, skipped=0, lr=[1.5017862681336581e-06, 1.5017862681336581e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:26:28,801] [INFO] [timer.py:260:stop] epoch=2/micro_step=1112/global_step=5770, RunningAvgSamplesPerSec=8.219332386527448, CurrSamplesPerSec=8.188769274622121, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5779, Loss: 0.06863710284233093 +[2024-01-21 23:27:07,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=5780, skipped=0, lr=[1.4778317634403082e-06, 1.4778317634403082e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:27:07,729] [INFO] [timer.py:260:stop] epoch=2/micro_step=1122/global_step=5780, RunningAvgSamplesPerSec=8.21933782338119, CurrSamplesPerSec=8.22900440236503, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5789, Loss: 0.07015746831893921 +[2024-01-21 23:27:46,704] [INFO] [logging.py:96:log_dist] [Rank 0] step=5790, skipped=0, lr=[1.4540545914890958e-06, 1.4540545914890958e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:27:46,705] [INFO] [timer.py:260:stop] epoch=2/micro_step=1132/global_step=5790, RunningAvgSamplesPerSec=8.219326010503599, CurrSamplesPerSec=8.231106800265273, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5799, Loss: 0.05854431539773941 +[2024-01-21 23:28:25,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=5800, skipped=0, lr=[1.4304552470450817e-06, 1.4304552470450817e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:28:25,620] [INFO] [timer.py:260:stop] epoch=2/micro_step=1142/global_step=5800, RunningAvgSamplesPerSec=8.219336195211925, CurrSamplesPerSec=8.219140118380402, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5809, Loss: 0.034306976944208145 +[2024-01-21 23:29:04,537] [INFO] [logging.py:96:log_dist] [Rank 0] step=5810, skipped=0, lr=[1.4070342211730215e-06, 1.4070342211730215e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:29:04,538] [INFO] [timer.py:260:stop] epoch=2/micro_step=1152/global_step=5810, RunningAvgSamplesPerSec=8.219345115373931, CurrSamplesPerSec=8.177501206509527, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5819, Loss: 0.08493407815694809 +[2024-01-21 23:29:43,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=5820, skipped=0, lr=[1.3837920012271445e-06, 1.3837920012271445e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:29:43,438] [INFO] [timer.py:260:stop] epoch=2/micro_step=1162/global_step=5820, RunningAvgSamplesPerSec=8.219360544743685, CurrSamplesPerSec=8.216956296525353, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5829, Loss: 0.012305965647101402 +[2024-01-21 23:30:22,398] [INFO] [logging.py:96:log_dist] [Rank 0] step=5830, skipped=0, lr=[1.3607290708410204e-06, 1.3607290708410204e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:30:22,398] [INFO] [timer.py:260:stop] epoch=2/micro_step=1172/global_step=5830, RunningAvgSamplesPerSec=8.219354346780772, CurrSamplesPerSec=8.217554970905923, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5839, Loss: 0.022700833156704903 +[2024-01-21 23:31:01,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=5840, skipped=0, lr=[1.3378459099174734e-06, 1.3378459099174734e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:31:01,308] [INFO] [timer.py:260:stop] epoch=2/micro_step=1182/global_step=5840, RunningAvgSamplesPerSec=8.219366249960371, CurrSamplesPerSec=8.224370911068242, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5849, Loss: 0.03721282631158829 +[2024-01-21 23:31:40,266] [INFO] [logging.py:96:log_dist] [Rank 0] step=5850, skipped=0, lr=[1.3151429946186322e-06, 1.3151429946186322e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:31:40,267] [INFO] [timer.py:260:stop] epoch=2/micro_step=1192/global_step=5850, RunningAvgSamplesPerSec=8.219360554381076, CurrSamplesPerSec=8.202871909394073, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5859, Loss: 0.021596966311335564 +[2024-01-21 23:32:19,258] [INFO] [logging.py:96:log_dist] [Rank 0] step=5860, skipped=0, lr=[1.29262079735598e-06, 1.29262079735598e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:32:19,258] [INFO] [timer.py:260:stop] epoch=2/micro_step=1202/global_step=5860, RunningAvgSamplesPerSec=8.219343199005557, CurrSamplesPerSec=8.213191176183521, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5869, Loss: 0.018902115523815155 +[2024-01-21 23:32:58,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=5870, skipped=0, lr=[1.2702797867805649e-06, 1.2702797867805649e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:32:58,143] [INFO] [timer.py:260:stop] epoch=2/micro_step=1212/global_step=5870, RunningAvgSamplesPerSec=8.219364279974023, CurrSamplesPerSec=8.2215371165622, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5879, Loss: 0.022082746028900146 +[2024-01-21 23:33:37,139] [INFO] [logging.py:96:log_dist] [Rank 0] step=5880, skipped=0, lr=[1.2481204277732107e-06, 1.2481204277732107e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:33:37,139] [INFO] [timer.py:260:stop] epoch=2/micro_step=1222/global_step=5880, RunningAvgSamplesPerSec=8.219345170465537, CurrSamplesPerSec=8.158371614464645, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5889, Loss: 0.020338241010904312 +[2024-01-21 23:34:16,096] [INFO] [logging.py:96:log_dist] [Rank 0] step=5890, skipped=0, lr=[1.22614318143488e-06, 1.22614318143488e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:34:16,096] [INFO] [timer.py:260:stop] epoch=2/micro_step=1232/global_step=5890, RunningAvgSamplesPerSec=8.219340143879936, CurrSamplesPerSec=8.197994883831981, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5899, Loss: 0.021952249109745026 +[2024-01-21 23:34:55,023] [INFO] [logging.py:96:log_dist] [Rank 0] step=5900, skipped=0, lr=[1.204348505077042e-06, 1.204348505077042e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:34:55,024] [INFO] [timer.py:260:stop] epoch=2/micro_step=1242/global_step=5900, RunningAvgSamplesPerSec=8.219345790743814, CurrSamplesPerSec=8.22789307873636, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5909, Loss: 0.024737587198615074 +[2024-01-21 23:35:33,959] [INFO] [logging.py:96:log_dist] [Rank 0] step=5910, skipped=0, lr=[1.182736852212192e-06, 1.182736852212192e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:35:33,959] [INFO] [timer.py:260:stop] epoch=2/micro_step=1252/global_step=5910, RunningAvgSamplesPerSec=8.219348301054115, CurrSamplesPerSec=8.24929544526329, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5919, Loss: 0.016918428242206573 +[2024-01-21 23:36:12,883] [INFO] [logging.py:96:log_dist] [Rank 0] step=5920, skipped=0, lr=[1.161308672544389e-06, 1.161308672544389e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:36:12,883] [INFO] [timer.py:260:stop] epoch=2/micro_step=1262/global_step=5920, RunningAvgSamplesPerSec=8.219355063364805, CurrSamplesPerSec=8.221194171024436, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5929, Loss: 0.06374916434288025 +[2024-01-21 23:36:51,829] [INFO] [logging.py:96:log_dist] [Rank 0] step=5930, skipped=0, lr=[1.140064411959909e-06, 1.140064411959909e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:36:51,830] [INFO] [timer.py:260:stop] epoch=2/micro_step=1272/global_step=5930, RunningAvgSamplesPerSec=8.21935367870451, CurrSamplesPerSec=8.214617774807088, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5939, Loss: 0.018773935735225677 +[2024-01-21 23:37:30,782] [INFO] [logging.py:96:log_dist] [Rank 0] step=5940, skipped=0, lr=[1.119004512517965e-06, 1.119004512517965e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:37:30,782] [INFO] [timer.py:260:stop] epoch=2/micro_step=1282/global_step=5940, RunningAvgSamplesPerSec=8.219350282957521, CurrSamplesPerSec=8.207516835570177, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5949, Loss: 0.03380183130502701 +[2024-01-21 23:38:09,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=5950, skipped=0, lr=[1.0981294124415075e-06, 1.0981294124415075e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:38:09,756] [INFO] [timer.py:260:stop] epoch=2/micro_step=1292/global_step=5950, RunningAvgSamplesPerSec=8.219339235085119, CurrSamplesPerSec=8.207998182003424, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5959, Loss: 0.01871536858379841 +[2024-01-21 23:38:48,719] [INFO] [logging.py:96:log_dist] [Rank 0] step=5960, skipped=0, lr=[1.0774395461081089e-06, 1.0774395461081089e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:38:48,720] [INFO] [timer.py:260:stop] epoch=2/micro_step=1302/global_step=5960, RunningAvgSamplesPerSec=8.219331884580678, CurrSamplesPerSec=8.185326940189203, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5969, Loss: 0.015952009707689285 +[2024-01-21 23:39:27,708] [INFO] [logging.py:96:log_dist] [Rank 0] step=5970, skipped=0, lr=[1.0569353440409213e-06, 1.0569353440409213e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:39:27,708] [INFO] [timer.py:260:stop] epoch=2/micro_step=1312/global_step=5970, RunningAvgSamplesPerSec=8.219315805252467, CurrSamplesPerSec=8.209984395749723, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5979, Loss: 0.019862456247210503 +[2024-01-21 23:40:06,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=5980, skipped=0, lr=[1.0366172328997182e-06, 1.0366172328997182e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:40:06,624] [INFO] [timer.py:260:stop] epoch=2/micro_step=1322/global_step=5980, RunningAvgSamplesPerSec=8.219325468878713, CurrSamplesPerSec=8.254770821062236, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5989, Loss: 0.07516815513372421 +[2024-01-21 23:40:45,558] [INFO] [logging.py:96:log_dist] [Rank 0] step=5990, skipped=0, lr=[1.0164856354720187e-06, 1.0164856354720187e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:40:45,559] [INFO] [timer.py:260:stop] epoch=2/micro_step=1332/global_step=5990, RunningAvgSamplesPerSec=8.219328363807275, CurrSamplesPerSec=8.229226904938558, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 5999, Loss: 0.03975485637784004 +[2024-01-21 23:41:24,508] [INFO] [logging.py:96:log_dist] [Rank 0] step=6000, skipped=0, lr=[9.96540970664287e-07, 9.96540970664287e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:41:24,508] [INFO] [timer.py:260:stop] epoch=2/micro_step=1342/global_step=6000, RunningAvgSamplesPerSec=8.219326064700159, CurrSamplesPerSec=8.221041088567077, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6009, Loss: 0.015117738395929337 +[2024-01-21 23:42:03,437] [INFO] [logging.py:96:log_dist] [Rank 0] step=6010, skipped=0, lr=[9.767836534932241e-07, 9.767836534932241e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:42:03,438] [INFO] [timer.py:260:stop] epoch=2/micro_step=1352/global_step=6010, RunningAvgSamplesPerSec=8.21933092843714, CurrSamplesPerSec=8.201960596312551, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6019, Loss: 0.022888703271746635 +[2024-01-21 23:42:42,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=6020, skipped=0, lr=[9.572140950771115e-07, 9.572140950771115e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:42:42,392] [INFO] [timer.py:260:stop] epoch=2/micro_step=1362/global_step=6020, RunningAvgSamplesPerSec=8.219326908315743, CurrSamplesPerSec=8.218538193550401, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6029, Loss: 0.08284764736890793 +[2024-01-21 23:43:21,361] [INFO] [logging.py:96:log_dist] [Rank 0] step=6030, skipped=0, lr=[9.378327026272871e-07, 9.378327026272871e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:43:21,362] [INFO] [timer.py:260:stop] epoch=2/micro_step=1372/global_step=6030, RunningAvgSamplesPerSec=8.219317710148024, CurrSamplesPerSec=8.218815994054822, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6039, Loss: 0.08206502348184586 +[2024-01-21 23:44:00,306] [INFO] [logging.py:96:log_dist] [Rank 0] step=6040, skipped=0, lr=[9.186398794396389e-07, 9.186398794396389e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:44:00,306] [INFO] [timer.py:260:stop] epoch=2/micro_step=1382/global_step=6040, RunningAvgSamplesPerSec=8.219317133337398, CurrSamplesPerSec=8.20873612191262, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6049, Loss: 0.03285090625286102 +[2024-01-21 23:44:39,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=6050, skipped=0, lr=[8.996360248862434e-07, 8.996360248862434e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:44:39,241] [INFO] [timer.py:260:stop] epoch=2/micro_step=1392/global_step=6050, RunningAvgSamplesPerSec=8.219319764898406, CurrSamplesPerSec=8.222812460216218, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6059, Loss: 0.04427472874522209 +[2024-01-21 23:45:18,189] [INFO] [logging.py:96:log_dist] [Rank 0] step=6060, skipped=0, lr=[8.80821534407027e-07, 8.80821534407027e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:45:18,190] [INFO] [timer.py:260:stop] epoch=2/micro_step=1402/global_step=6060, RunningAvgSamplesPerSec=8.219318042244316, CurrSamplesPerSec=8.22891913812783, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6069, Loss: 0.0753503069281578 +[2024-01-21 23:45:57,128] [INFO] [logging.py:96:log_dist] [Rank 0] step=6070, skipped=0, lr=[8.621967995015645e-07, 8.621967995015645e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:45:57,129] [INFO] [timer.py:260:stop] epoch=2/micro_step=1412/global_step=6070, RunningAvgSamplesPerSec=8.2193193932554, CurrSamplesPerSec=8.250435887581697, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6079, Loss: 0.01602942869067192 +[2024-01-21 23:46:36,130] [INFO] [logging.py:96:log_dist] [Rank 0] step=6080, skipped=0, lr=[8.437622077209073e-07, 8.437622077209073e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:46:36,131] [INFO] [timer.py:260:stop] epoch=2/micro_step=1422/global_step=6080, RunningAvgSamplesPerSec=8.219298984931214, CurrSamplesPerSec=8.205781646335584, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6089, Loss: 0.026716452091932297 +[2024-01-21 23:47:15,114] [INFO] [logging.py:96:log_dist] [Rank 0] step=6090, skipped=0, lr=[8.255181426595427e-07, 8.255181426595427e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:47:15,115] [INFO] [timer.py:260:stop] epoch=2/micro_step=1432/global_step=6090, RunningAvgSamplesPerSec=8.219284828466577, CurrSamplesPerSec=8.195929884348843, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6099, Loss: 0.024157440289855003 +[2024-01-21 23:47:54,091] [INFO] [logging.py:96:log_dist] [Rank 0] step=6100, skipped=0, lr=[8.074649839473925e-07, 8.074649839473925e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:47:54,091] [INFO] [timer.py:260:stop] epoch=2/micro_step=1442/global_step=6100, RunningAvgSamplesPerSec=8.219273168212807, CurrSamplesPerSec=8.219951044469573, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6109, Loss: 0.02378617785871029 +[2024-01-21 23:48:33,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=6110, skipped=0, lr=[7.896031072419263e-07, 7.896031072419263e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:48:33,134] [INFO] [timer.py:260:stop] epoch=2/micro_step=1452/global_step=6110, RunningAvgSamplesPerSec=8.219239015803181, CurrSamplesPerSec=8.234689810082948, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6119, Loss: 0.01676495373249054 +[2024-01-21 23:49:12,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=6120, skipped=0, lr=[7.719328842203355e-07, 7.719328842203355e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:49:12,103] [INFO] [timer.py:260:stop] epoch=2/micro_step=1462/global_step=6120, RunningAvgSamplesPerSec=8.219230057722465, CurrSamplesPerSec=8.225477754706855, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6129, Loss: 0.011398369446396828 +[2024-01-21 23:49:51,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=6130, skipped=0, lr=[7.54454682571808e-07, 7.54454682571808e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:49:51,050] [INFO] [timer.py:260:stop] epoch=2/micro_step=1472/global_step=6130, RunningAvgSamplesPerSec=8.219228798092315, CurrSamplesPerSec=8.192405520073045, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6139, Loss: 0.07595191150903702 +[2024-01-21 23:50:30,011] [INFO] [logging.py:96:log_dist] [Rank 0] step=6140, skipped=0, lr=[7.371688659898712e-07, 7.371688659898712e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:50:30,012] [INFO] [timer.py:260:stop] epoch=2/micro_step=1482/global_step=6140, RunningAvgSamplesPerSec=8.21922269673471, CurrSamplesPerSec=8.174615968653574, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6149, Loss: 0.023832369595766068 +[2024-01-21 23:51:08,979] [INFO] [logging.py:96:log_dist] [Rank 0] step=6150, skipped=0, lr=[7.20075794164824e-07, 7.20075794164824e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:51:08,980] [INFO] [timer.py:260:stop] epoch=2/micro_step=1492/global_step=6150, RunningAvgSamplesPerSec=8.219214282554384, CurrSamplesPerSec=8.190959132268471, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6159, Loss: 0.026711737737059593 +[2024-01-21 23:51:47,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=6160, skipped=0, lr=[7.031758227762575e-07, 7.031758227762575e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:51:47,946] [INFO] [timer.py:260:stop] epoch=2/micro_step=1502/global_step=6160, RunningAvgSamplesPerSec=8.219206642044782, CurrSamplesPerSec=8.223379238685508, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6169, Loss: 0.024021131917834282 +[2024-01-21 23:52:26,875] [INFO] [logging.py:96:log_dist] [Rank 0] step=6170, skipped=0, lr=[6.864693034856473e-07, 6.864693034856473e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:52:26,876] [INFO] [timer.py:260:stop] epoch=2/micro_step=1512/global_step=6170, RunningAvgSamplesPerSec=8.219211098079978, CurrSamplesPerSec=8.188605906816822, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6179, Loss: 0.04429052770137787 +[2024-01-21 23:53:05,927] [INFO] [logging.py:96:log_dist] [Rank 0] step=6180, skipped=0, lr=[6.699565839290412e-07, 6.699565839290412e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:53:05,927] [INFO] [timer.py:260:stop] epoch=2/micro_step=1522/global_step=6180, RunningAvgSamplesPerSec=8.219174370250744, CurrSamplesPerSec=8.152013141593184, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6189, Loss: 0.017325280234217644 +[2024-01-21 23:53:44,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=6190, skipped=0, lr=[6.536380077098214e-07, 6.536380077098214e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:53:44,906] [INFO] [timer.py:260:stop] epoch=2/micro_step=1532/global_step=6190, RunningAvgSamplesPerSec=8.219162277630465, CurrSamplesPerSec=8.196066517622176, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6199, Loss: 0.01359492912888527 +[2024-01-21 23:54:23,839] [INFO] [logging.py:96:log_dist] [Rank 0] step=6200, skipped=0, lr=[6.375139143915588e-07, 6.375139143915588e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:54:23,840] [INFO] [timer.py:260:stop] epoch=2/micro_step=1542/global_step=6200, RunningAvgSamplesPerSec=8.219165768863547, CurrSamplesPerSec=8.203614944816584, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6209, Loss: 0.011713271029293537 +[2024-01-21 23:55:02,745] [INFO] [logging.py:96:log_dist] [Rank 0] step=6210, skipped=0, lr=[6.215846394909442e-07, 6.215846394909442e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:55:02,746] [INFO] [timer.py:260:stop] epoch=2/micro_step=1552/global_step=6210, RunningAvgSamplesPerSec=8.219178624127927, CurrSamplesPerSec=8.203301068629152, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6219, Loss: 0.0570458360016346 +[2024-01-21 23:55:41,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=6220, skipped=0, lr=[6.058505144708061e-07, 6.058505144708061e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:55:41,650] [INFO] [timer.py:260:stop] epoch=2/micro_step=1562/global_step=6220, RunningAvgSamplesPerSec=8.219192033120876, CurrSamplesPerSec=8.22959877850134, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6229, Loss: 0.05689249187707901 +[2024-01-21 23:56:20,649] [INFO] [logging.py:96:log_dist] [Rank 0] step=6230, skipped=0, lr=[5.903118667332164e-07, 5.903118667332164e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:56:20,649] [INFO] [timer.py:260:stop] epoch=2/micro_step=1572/global_step=6230, RunningAvgSamplesPerSec=8.219173365901495, CurrSamplesPerSec=8.197536739597192, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6239, Loss: 0.012910223565995693 +[2024-01-21 23:56:59,640] [INFO] [logging.py:96:log_dist] [Rank 0] step=6240, skipped=0, lr=[5.749690196126767e-07, 5.749690196126767e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:56:59,640] [INFO] [timer.py:260:stop] epoch=2/micro_step=1582/global_step=6240, RunningAvgSamplesPerSec=8.219157361875695, CurrSamplesPerSec=8.223037650840187, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6249, Loss: 0.01848549023270607 +[2024-01-21 23:57:38,580] [INFO] [logging.py:96:log_dist] [Rank 0] step=6250, skipped=0, lr=[5.598222923693875e-07, 5.598222923693875e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:57:38,581] [INFO] [timer.py:260:stop] epoch=2/micro_step=1592/global_step=6250, RunningAvgSamplesPerSec=8.219158604501354, CurrSamplesPerSec=8.209554034820671, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6259, Loss: 0.015389622189104557 +[2024-01-21 23:58:17,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=6260, skipped=0, lr=[5.448720001826091e-07, 5.448720001826091e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:58:17,517] [INFO] [timer.py:260:stop] epoch=2/micro_step=1602/global_step=6260, RunningAvgSamplesPerSec=8.219161048777913, CurrSamplesPerSec=8.162537345365587, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6269, Loss: 0.019812066107988358 +[2024-01-21 23:58:56,513] [INFO] [logging.py:96:log_dist] [Rank 0] step=6270, skipped=0, lr=[5.301184541441007e-07, 5.301184541441007e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:58:56,513] [INFO] [timer.py:260:stop] epoch=2/micro_step=1612/global_step=6270, RunningAvgSamplesPerSec=8.21914345122061, CurrSamplesPerSec=8.202254319777344, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6279, Loss: 0.03375754505395889 +[2024-01-21 23:59:35,471] [INFO] [logging.py:96:log_dist] [Rank 0] step=6280, skipped=0, lr=[5.155619612516505e-07, 5.155619612516505e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-21 23:59:35,472] [INFO] [timer.py:260:stop] epoch=2/micro_step=1622/global_step=6280, RunningAvgSamplesPerSec=8.21913867873584, CurrSamplesPerSec=8.220545120421816, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6289, Loss: 0.023988140746951103 +[2024-01-22 00:00:14,363] [INFO] [logging.py:96:log_dist] [Rank 0] step=6290, skipped=0, lr=[5.012028244026757e-07, 5.012028244026757e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:00:14,364] [INFO] [timer.py:260:stop] epoch=2/micro_step=1632/global_step=6290, RunningAvgSamplesPerSec=8.219156024171985, CurrSamplesPerSec=8.243302786574626, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6299, Loss: 0.014279213733971119 +[2024-01-22 00:00:53,309] [INFO] [logging.py:96:log_dist] [Rank 0] step=6300, skipped=0, lr=[4.870413423879416e-07, 4.870413423879416e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:00:53,309] [INFO] [timer.py:260:stop] epoch=2/micro_step=1642/global_step=6300, RunningAvgSamplesPerSec=8.21915561003808, CurrSamplesPerSec=8.221438913374692, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6309, Loss: 0.0229865163564682 +[2024-01-22 00:01:32,260] [INFO] [logging.py:96:log_dist] [Rank 0] step=6310, skipped=0, lr=[4.7307780988531946e-07, 4.7307780988531946e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:01:32,261] [INFO] [timer.py:260:stop] epoch=2/micro_step=1652/global_step=6310, RunningAvgSamplesPerSec=8.219152963474546, CurrSamplesPerSec=8.23407853353131, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6319, Loss: 0.04358433187007904 +[2024-01-22 00:02:11,181] [INFO] [logging.py:96:log_dist] [Rank 0] step=6320, skipped=0, lr=[4.593125174536761e-07, 4.593125174536761e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:02:11,182] [INFO] [timer.py:260:stop] epoch=2/micro_step=1662/global_step=6320, RunningAvgSamplesPerSec=8.219160627186286, CurrSamplesPerSec=8.238243578210502, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6329, Loss: 0.025955243036150932 +[2024-01-22 00:02:50,138] [INFO] [logging.py:96:log_dist] [Rank 0] step=6330, skipped=0, lr=[4.457457515268082e-07, 4.457457515268082e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:02:50,139] [INFO] [timer.py:260:stop] epoch=2/micro_step=1672/global_step=6330, RunningAvgSamplesPerSec=8.219156134565935, CurrSamplesPerSec=8.237261701780056, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6339, Loss: 0.030328722670674324 +[2024-01-22 00:03:29,146] [INFO] [logging.py:96:log_dist] [Rank 0] step=6340, skipped=0, lr=[4.323777944075058e-07, 4.323777944075058e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:03:29,147] [INFO] [timer.py:260:stop] epoch=2/micro_step=1682/global_step=6340, RunningAvgSamplesPerSec=8.219134739276745, CurrSamplesPerSec=8.208153790944532, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6349, Loss: 0.04364308714866638 +[2024-01-22 00:04:08,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=6350, skipped=0, lr=[4.192089242616482e-07, 4.192089242616482e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:04:08,123] [INFO] [timer.py:260:stop] epoch=2/micro_step=1692/global_step=6350, RunningAvgSamplesPerSec=8.219123903279351, CurrSamplesPerSec=8.201200822284816, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6359, Loss: 0.022946437820792198 +[2024-01-22 00:04:47,043] [INFO] [logging.py:96:log_dist] [Rank 0] step=6360, skipped=0, lr=[4.0623941511244713e-07, 4.0623941511244713e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:04:47,044] [INFO] [timer.py:260:stop] epoch=2/micro_step=1702/global_step=6360, RunningAvgSamplesPerSec=8.219131676365295, CurrSamplesPerSec=8.230152866854452, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6369, Loss: 0.017521433532238007 +[2024-01-22 00:05:26,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=6370, skipped=0, lr=[3.9346953683471857e-07, 3.9346953683471857e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:05:26,020] [INFO] [timer.py:260:stop] epoch=2/micro_step=1712/global_step=6370, RunningAvgSamplesPerSec=8.219121096494794, CurrSamplesPerSec=8.208633705996037, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6379, Loss: 0.013021346181631088 +[2024-01-22 00:06:04,956] [INFO] [logging.py:96:log_dist] [Rank 0] step=6380, skipped=0, lr=[3.8089955514928554e-07, 3.8089955514928554e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:06:04,957] [INFO] [timer.py:260:stop] epoch=2/micro_step=1722/global_step=6380, RunningAvgSamplesPerSec=8.219123433066972, CurrSamplesPerSec=8.218781267964728, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6389, Loss: 0.06140157952904701 +[2024-01-22 00:06:43,959] [INFO] [logging.py:96:log_dist] [Rank 0] step=6390, skipped=0, lr=[3.685297316174363e-07, 3.685297316174363e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:06:43,959] [INFO] [timer.py:260:stop] epoch=2/micro_step=1732/global_step=6390, RunningAvgSamplesPerSec=8.219104046408454, CurrSamplesPerSec=8.202353067714865, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6399, Loss: 0.012897399254143238 +[2024-01-22 00:07:23,205] [INFO] [logging.py:96:log_dist] [Rank 0] step=6400, skipped=0, lr=[3.5636032363549065e-07, 3.5636032363549065e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:07:23,205] [INFO] [timer.py:260:stop] epoch=2/micro_step=1742/global_step=6400, RunningAvgSamplesPerSec=8.219004462676585, CurrSamplesPerSec=7.6996544677109275, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6409, Loss: 0.1414138674736023 +[2024-01-22 00:08:02,164] [INFO] [logging.py:96:log_dist] [Rank 0] step=6410, skipped=0, lr=[3.4439158442943655e-07, 3.4439158442943655e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:08:02,165] [INFO] [timer.py:260:stop] epoch=2/micro_step=1752/global_step=6410, RunningAvgSamplesPerSec=8.218999645124617, CurrSamplesPerSec=8.208613122713382, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6419, Loss: 0.019393354654312134 +[2024-01-22 00:08:41,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=6420, skipped=0, lr=[3.326237630496687e-07, 3.326237630496687e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:08:41,199] [INFO] [timer.py:260:stop] epoch=2/micro_step=1762/global_step=6420, RunningAvgSamplesPerSec=8.218970012641993, CurrSamplesPerSec=8.175339452576777, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6429, Loss: 0.04459045082330704 +[2024-01-22 00:09:20,183] [INFO] [logging.py:96:log_dist] [Rank 0] step=6430, skipped=0, lr=[3.210571043657973e-07, 3.210571043657973e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:09:20,183] [INFO] [timer.py:260:stop] epoch=2/micro_step=1772/global_step=6430, RunningAvgSamplesPerSec=8.218957125231928, CurrSamplesPerSec=8.22638825103302, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6439, Loss: 0.0178780909627676 +[2024-01-22 00:09:59,121] [INFO] [logging.py:96:log_dist] [Rank 0] step=6440, skipped=0, lr=[3.096918490615608e-07, 3.096918490615608e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:09:59,121] [INFO] [timer.py:260:stop] epoch=2/micro_step=1782/global_step=6440, RunningAvgSamplesPerSec=8.218959410063363, CurrSamplesPerSec=8.229353550098494, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6449, Loss: 0.01611294597387314 +[2024-01-22 00:10:38,077] [INFO] [logging.py:96:log_dist] [Rank 0] step=6450, skipped=0, lr=[2.985282336298134e-07, 2.985282336298134e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:10:38,077] [INFO] [timer.py:260:stop] epoch=2/micro_step=1792/global_step=6450, RunningAvgSamplesPerSec=8.21895570671678, CurrSamplesPerSec=8.186687946804048, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6459, Loss: 0.0123255904763937 +[2024-01-22 00:11:17,090] [INFO] [logging.py:96:log_dist] [Rank 0] step=6460, skipped=0, lr=[2.875664903676045e-07, 2.875664903676045e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:11:17,090] [INFO] [timer.py:260:stop] epoch=2/micro_step=1802/global_step=6460, RunningAvgSamplesPerSec=8.218933412514223, CurrSamplesPerSec=8.229206218250924, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6469, Loss: 0.05105749890208244 +[2024-01-22 00:11:56,126] [INFO] [logging.py:96:log_dist] [Rank 0] step=6470, skipped=0, lr=[2.768068473713459e-07, 2.768068473713459e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:11:56,126] [INFO] [timer.py:260:stop] epoch=2/micro_step=1812/global_step=6470, RunningAvgSamplesPerSec=8.218903667432148, CurrSamplesPerSec=8.236810784451986, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6479, Loss: 0.05995270982384682 +[2024-01-22 00:12:35,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=6480, skipped=0, lr=[2.662495285320632e-07, 2.662495285320632e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:12:35,079] [INFO] [timer.py:260:stop] epoch=2/micro_step=1822/global_step=6480, RunningAvgSamplesPerSec=8.218901001659136, CurrSamplesPerSec=8.221810587031408, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6489, Loss: 0.016942407935857773 +[2024-01-22 00:13:14,070] [INFO] [logging.py:96:log_dist] [Rank 0] step=6490, skipped=0, lr=[2.5589475353073987e-07, 2.5589475353073987e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:13:14,071] [INFO] [timer.py:260:stop] epoch=2/micro_step=1832/global_step=6490, RunningAvgSamplesPerSec=8.218885815349786, CurrSamplesPerSec=8.20049780589285, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6499, Loss: 0.09971711784601212 +[2024-01-22 00:13:53,045] [INFO] [logging.py:96:log_dist] [Rank 0] step=6500, skipped=0, lr=[2.4574273783374623e-07, 2.4574273783374623e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:13:53,046] [INFO] [timer.py:260:stop] epoch=2/micro_step=1842/global_step=6500, RunningAvgSamplesPerSec=8.21887629462371, CurrSamplesPerSec=8.221032024652978, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6509, Loss: 0.020137757062911987 +[2024-01-22 00:14:32,028] [INFO] [logging.py:96:log_dist] [Rank 0] step=6510, skipped=0, lr=[2.3579369268835196e-07, 2.3579369268835196e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:14:32,029] [INFO] [timer.py:260:stop] epoch=2/micro_step=1852/global_step=6510, RunningAvgSamplesPerSec=8.218864008663495, CurrSamplesPerSec=8.2427509759645, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6519, Loss: 0.025835387408733368 +[2024-01-22 00:15:10,973] [INFO] [logging.py:96:log_dist] [Rank 0] step=6520, skipped=0, lr=[2.26047825118334e-07, 2.26047825118334e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:15:10,974] [INFO] [timer.py:260:stop] epoch=2/micro_step=1862/global_step=6520, RunningAvgSamplesPerSec=8.218864034450817, CurrSamplesPerSec=8.218269469628535, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6529, Loss: 0.1067107766866684 +[2024-01-22 00:15:49,949] [INFO] [logging.py:96:log_dist] [Rank 0] step=6530, skipped=0, lr=[2.165053379196691e-07, 2.165053379196691e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:15:49,950] [INFO] [timer.py:260:stop] epoch=2/micro_step=1872/global_step=6530, RunningAvgSamplesPerSec=8.21885400218745, CurrSamplesPerSec=8.211278261083425, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6539, Loss: 0.03827182203531265 +[2024-01-22 00:16:28,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=6540, skipped=0, lr=[2.0716642965630917e-07, 2.0716642965630917e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:16:28,946] [INFO] [timer.py:260:stop] epoch=2/micro_step=1882/global_step=6540, RunningAvgSamplesPerSec=8.218837644029401, CurrSamplesPerSec=8.222733873103094, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6549, Loss: 0.018232209607958794 +[2024-01-22 00:17:07,928] [INFO] [logging.py:96:log_dist] [Rank 0] step=6550, skipped=0, lr=[1.9803129465605808e-07, 1.9803129465605808e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:17:07,929] [INFO] [timer.py:260:stop] epoch=2/micro_step=1892/global_step=6550, RunningAvgSamplesPerSec=8.218825417323115, CurrSamplesPerSec=8.178368719909468, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6559, Loss: 0.020401109009981155 +[2024-01-22 00:17:46,910] [INFO] [logging.py:96:log_dist] [Rank 0] step=6560, skipped=0, lr=[1.8910012300651592e-07, 1.8910012300651592e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:17:46,910] [INFO] [timer.py:260:stop] epoch=2/micro_step=1902/global_step=6560, RunningAvgSamplesPerSec=8.218813723928882, CurrSamplesPerSec=8.229486758775693, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6569, Loss: 0.0342131182551384 +[2024-01-22 00:18:25,977] [INFO] [logging.py:96:log_dist] [Rank 0] step=6570, skipped=0, lr=[1.8037310055113778e-07, 1.8037310055113778e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:18:25,977] [INFO] [timer.py:260:stop] epoch=2/micro_step=1912/global_step=6570, RunningAvgSamplesPerSec=8.218774654369417, CurrSamplesPerSec=8.195828788668729, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6579, Loss: 0.1924186646938324 +[2024-01-22 00:19:04,928] [INFO] [logging.py:96:log_dist] [Rank 0] step=6580, skipped=0, lr=[1.718504088853512e-07, 1.718504088853512e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:19:04,928] [INFO] [timer.py:260:stop] epoch=2/micro_step=1922/global_step=6580, RunningAvgSamplesPerSec=8.218772932651166, CurrSamplesPerSec=8.209099116549712, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6589, Loss: 0.04612429440021515 +[2024-01-22 00:19:43,887] [INFO] [logging.py:96:log_dist] [Rank 0] step=6590, skipped=0, lr=[1.63532225352796e-07, 1.63532225352796e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:19:43,888] [INFO] [timer.py:260:stop] epoch=2/micro_step=1932/global_step=6590, RunningAvgSamplesPerSec=8.218768663584697, CurrSamplesPerSec=8.219369638021334, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6599, Loss: 0.05914175510406494 +[2024-01-22 00:20:22,842] [INFO] [logging.py:96:log_dist] [Rank 0] step=6600, skipped=0, lr=[1.5541872304161266e-07, 1.5541872304161266e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:20:22,842] [INFO] [timer.py:260:stop] epoch=2/micro_step=1942/global_step=6600, RunningAvgSamplesPerSec=8.218765732310212, CurrSamplesPerSec=8.239743135687268, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6609, Loss: 0.04632081463932991 +[2024-01-22 00:21:01,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=6610, skipped=0, lr=[1.4751007078085854e-07, 1.4751007078085854e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:21:01,843] [INFO] [timer.py:260:stop] epoch=2/micro_step=1952/global_step=6610, RunningAvgSamplesPerSec=8.218748137126985, CurrSamplesPerSec=8.193367728316463, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6619, Loss: 0.014974902383983135 +[2024-01-22 00:21:40,804] [INFO] [logging.py:96:log_dist] [Rank 0] step=6620, skipped=0, lr=[1.3980643313698528e-07, 1.3980643313698528e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:21:40,804] [INFO] [timer.py:260:stop] epoch=2/micro_step=1962/global_step=6620, RunningAvgSamplesPerSec=8.218743275703382, CurrSamplesPerSec=8.216631338390199, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6629, Loss: 0.024507585912942886 +[2024-01-22 00:22:19,753] [INFO] [logging.py:96:log_dist] [Rank 0] step=6630, skipped=0, lr=[1.323079704104191e-07, 1.323079704104191e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:22:19,753] [INFO] [timer.py:260:stop] epoch=2/micro_step=1972/global_step=6630, RunningAvgSamplesPerSec=8.218742358451195, CurrSamplesPerSec=8.20587646565903, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6639, Loss: 0.01795290783047676 +[2024-01-22 00:22:58,700] [INFO] [logging.py:96:log_dist] [Rank 0] step=6640, skipped=0, lr=[1.250148386322192e-07, 1.250148386322192e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:22:58,700] [INFO] [timer.py:260:stop] epoch=2/micro_step=1982/global_step=6640, RunningAvgSamplesPerSec=8.218741918498107, CurrSamplesPerSec=8.18788456851674, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6649, Loss: 0.038139283657073975 +[2024-01-22 00:23:37,684] [INFO] [logging.py:96:log_dist] [Rank 0] step=6650, skipped=0, lr=[1.1792718956083915e-07, 1.1792718956083915e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:23:37,685] [INFO] [timer.py:260:stop] epoch=2/micro_step=1992/global_step=6650, RunningAvgSamplesPerSec=8.218729622540447, CurrSamplesPerSec=8.203907783875021, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6659, Loss: 0.022854287177324295 +[2024-01-22 00:24:16,635] [INFO] [logging.py:96:log_dist] [Rank 0] step=6660, skipped=0, lr=[1.1104517067896281e-07, 1.1104517067896281e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:24:16,635] [INFO] [timer.py:260:stop] epoch=2/micro_step=2002/global_step=6660, RunningAvgSamplesPerSec=8.218728123030798, CurrSamplesPerSec=8.20616796100543, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6669, Loss: 0.01434221863746643 +[2024-01-22 00:24:55,621] [INFO] [logging.py:96:log_dist] [Rank 0] step=6670, skipped=0, lr=[1.0436892519043673e-07, 1.0436892519043673e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:24:55,622] [INFO] [timer.py:260:stop] epoch=2/micro_step=2012/global_step=6670, RunningAvgSamplesPerSec=8.218715246920972, CurrSamplesPerSec=8.203928845024109, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6679, Loss: 0.0356341153383255 +[2024-01-22 00:25:34,565] [INFO] [logging.py:96:log_dist] [Rank 0] step=6680, skipped=0, lr=[9.789859201729257e-08, 9.789859201729257e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:25:34,565] [INFO] [timer.py:260:stop] epoch=2/micro_step=2022/global_step=6680, RunningAvgSamplesPerSec=8.218715973849426, CurrSamplesPerSec=8.212060001549194, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6689, Loss: 0.010973628610372543 +[2024-01-22 00:26:13,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=6690, skipped=0, lr=[9.163430579685384e-08, 9.163430579685384e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:26:13,624] [INFO] [timer.py:260:stop] epoch=2/micro_step=2032/global_step=6690, RunningAvgSamplesPerSec=8.21868055529426, CurrSamplesPerSec=8.205500211407392, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6699, Loss: 0.06653212755918503 +[2024-01-22 00:26:52,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=6700, skipped=0, lr=[8.557619687893481e-08, 8.557619687893481e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:26:52,618] [INFO] [timer.py:260:stop] epoch=2/micro_step=2042/global_step=6700, RunningAvgSamplesPerSec=8.218665362046103, CurrSamplesPerSec=8.20968760759647, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6709, Loss: 0.012536714784801006 +[2024-01-22 00:27:31,613] [INFO] [logging.py:96:log_dist] [Rank 0] step=6710, skipped=0, lr=[7.972439132313048e-08, 7.972439132313048e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:27:31,613] [INFO] [timer.py:260:stop] epoch=2/micro_step=2052/global_step=6710, RunningAvgSamplesPerSec=8.218649920510293, CurrSamplesPerSec=8.197216820055097, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6719, Loss: 0.017311861738562584 +[2024-01-22 00:28:10,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=6720, skipped=0, lr=[7.407901089619086e-08, 7.407901089619086e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:28:10,568] [INFO] [timer.py:260:stop] epoch=2/micro_step=2062/global_step=6720, RunningAvgSamplesPerSec=8.218647303155038, CurrSamplesPerSec=8.23508390423703, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6729, Loss: 0.019088055938482285 +[2024-01-22 00:28:49,579] [INFO] [logging.py:96:log_dist] [Rank 0] step=6730, skipped=0, lr=[6.864017306948523e-08, 6.864017306948523e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:28:49,580] [INFO] [timer.py:260:stop] epoch=2/micro_step=2072/global_step=6730, RunningAvgSamplesPerSec=8.218626634310645, CurrSamplesPerSec=8.175594918302362, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6739, Loss: 0.022980017587542534 +[2024-01-22 00:29:28,551] [INFO] [logging.py:96:log_dist] [Rank 0] step=6740, skipped=0, lr=[6.340799101656525e-08, 6.340799101656525e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:29:28,551] [INFO] [timer.py:260:stop] epoch=2/micro_step=2082/global_step=6740, RunningAvgSamplesPerSec=8.218618831885035, CurrSamplesPerSec=8.178954308292322, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6749, Loss: 0.06134231016039848 +[2024-01-22 00:30:07,555] [INFO] [logging.py:96:log_dist] [Rank 0] step=6750, skipped=0, lr=[5.838257361080124e-08, 5.838257361080124e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:30:07,556] [INFO] [timer.py:260:stop] epoch=2/micro_step=2092/global_step=6750, RunningAvgSamplesPerSec=8.218600577806127, CurrSamplesPerSec=8.247187294077975, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6759, Loss: 0.027232782915234566 +[2024-01-22 00:30:46,539] [INFO] [logging.py:96:log_dist] [Rank 0] step=6760, skipped=0, lr=[5.356402542312289e-08, 5.356402542312289e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:30:46,539] [INFO] [timer.py:260:stop] epoch=2/micro_step=2102/global_step=6760, RunningAvgSamplesPerSec=8.218589135819869, CurrSamplesPerSec=8.205846865740792, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6769, Loss: 0.02103675715625286 +[2024-01-22 00:31:25,494] [INFO] [logging.py:96:log_dist] [Rank 0] step=6770, skipped=0, lr=[4.8952446719839896e-08, 4.8952446719839896e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:31:25,495] [INFO] [timer.py:260:stop] epoch=2/micro_step=2112/global_step=6770, RunningAvgSamplesPerSec=8.21858632806462, CurrSamplesPerSec=8.186691442267904, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6779, Loss: 0.04041306674480438 +[2024-01-22 00:32:04,521] [INFO] [logging.py:96:log_dist] [Rank 0] step=6780, skipped=0, lr=[4.454793346055697e-08, 4.454793346055697e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:32:04,521] [INFO] [timer.py:260:stop] epoch=2/micro_step=2122/global_step=6780, RunningAvgSamplesPerSec=8.21856147659439, CurrSamplesPerSec=8.208132206126066, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6789, Loss: 0.030824165791273117 +[2024-01-22 00:32:43,472] [INFO] [logging.py:96:log_dist] [Rank 0] step=6790, skipped=0, lr=[4.035057729617764e-08, 4.035057729617764e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:32:43,472] [INFO] [timer.py:260:stop] epoch=2/micro_step=2132/global_step=6790, RunningAvgSamplesPerSec=8.218560129683842, CurrSamplesPerSec=8.230725705104332, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6799, Loss: 0.09942390024662018 +[2024-01-22 00:33:22,455] [INFO] [logging.py:96:log_dist] [Rank 0] step=6800, skipped=0, lr=[3.6360465566994685e-08, 3.6360465566994685e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:33:22,455] [INFO] [timer.py:260:stop] epoch=2/micro_step=2142/global_step=6800, RunningAvgSamplesPerSec=8.218548945072937, CurrSamplesPerSec=8.189262415146011, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6809, Loss: 0.019095921888947487 +[2024-01-22 00:34:01,519] [INFO] [logging.py:96:log_dist] [Rank 0] step=6810, skipped=0, lr=[3.257768130087713e-08, 3.257768130087713e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:34:01,519] [INFO] [timer.py:260:stop] epoch=2/micro_step=2152/global_step=6810, RunningAvgSamplesPerSec=8.218512639025874, CurrSamplesPerSec=8.181431152932628, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6819, Loss: 0.033030033111572266 +[2024-01-22 00:34:40,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=6820, skipped=0, lr=[2.9002303211537186e-08, 2.9002303211537186e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:34:40,444] [INFO] [timer.py:260:stop] epoch=2/micro_step=2162/global_step=6820, RunningAvgSamplesPerSec=8.218519730753144, CurrSamplesPerSec=8.220684589679406, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6829, Loss: 0.04316024109721184 +[2024-01-22 00:35:19,404] [INFO] [logging.py:96:log_dist] [Rank 0] step=6830, skipped=0, lr=[2.5634405696896013e-08, 2.5634405696896013e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:35:19,404] [INFO] [timer.py:260:stop] epoch=2/micro_step=2172/global_step=6830, RunningAvgSamplesPerSec=8.218515540596268, CurrSamplesPerSec=8.207526371604095, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6839, Loss: 0.014131908304989338 +[2024-01-22 00:35:58,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=6840, skipped=0, lr=[2.2474058837536062e-08, 2.2474058837536062e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:35:58,360] [INFO] [timer.py:260:stop] epoch=2/micro_step=2182/global_step=6840, RunningAvgSamplesPerSec=8.218512727589204, CurrSamplesPerSec=8.218990635507376, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6849, Loss: 0.03492780402302742 +[2024-01-22 00:36:37,349] [INFO] [logging.py:96:log_dist] [Rank 0] step=6850, skipped=0, lr=[1.9521328395237794e-08, 1.9521328395237794e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:36:37,350] [INFO] [timer.py:260:stop] epoch=2/micro_step=2192/global_step=6850, RunningAvgSamplesPerSec=8.218499642741836, CurrSamplesPerSec=8.21203688883472, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6859, Loss: 0.0235967505723238 +[2024-01-22 00:37:16,356] [INFO] [logging.py:96:log_dist] [Rank 0] step=6860, skipped=0, lr=[1.677627581161745e-08, 1.677627581161745e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:37:16,357] [INFO] [timer.py:260:stop] epoch=2/micro_step=2202/global_step=6860, RunningAvgSamplesPerSec=8.218481146990689, CurrSamplesPerSec=8.209263805200695, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6869, Loss: 0.015142410062253475 +[2024-01-22 00:37:55,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=6870, skipped=0, lr=[1.4238958206845844e-08, 1.4238958206845844e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:37:55,365] [INFO] [timer.py:260:stop] epoch=2/micro_step=2212/global_step=6870, RunningAvgSamplesPerSec=8.218462547940307, CurrSamplesPerSec=8.1925890423518, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6879, Loss: 0.029446352273225784 +[2024-01-22 00:38:34,349] [INFO] [logging.py:96:log_dist] [Rank 0] step=6880, skipped=0, lr=[1.190942837846043e-08, 1.190942837846043e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:38:34,350] [INFO] [timer.py:260:stop] epoch=2/micro_step=2222/global_step=6880, RunningAvgSamplesPerSec=8.218450838634775, CurrSamplesPerSec=8.23417148213117, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6889, Loss: 0.014533378183841705 +[2024-01-22 00:39:13,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=6890, skipped=0, lr=[9.78773480026396e-09, 9.78773480026396e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:39:13,342] [INFO] [timer.py:260:stop] epoch=2/micro_step=2232/global_step=6890, RunningAvgSamplesPerSec=8.218437066340277, CurrSamplesPerSec=8.218373132580057, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6899, Loss: 0.016597239300608635 +[2024-01-22 00:39:52,372] [INFO] [logging.py:96:log_dist] [Rank 0] step=6900, skipped=0, lr=[7.873921621319725e-09, 7.873921621319725e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:39:52,372] [INFO] [timer.py:260:stop] epoch=2/micro_step=2242/global_step=6900, RunningAvgSamplesPerSec=8.218411670365718, CurrSamplesPerSec=8.156050455341617, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6909, Loss: 0.04214925318956375 +[2024-01-22 00:40:31,342] [INFO] [logging.py:96:log_dist] [Rank 0] step=6910, skipped=0, lr=[6.168028665028969e-09, 6.168028665028969e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:40:31,343] [INFO] [timer.py:260:stop] epoch=2/micro_step=2252/global_step=6910, RunningAvgSamplesPerSec=8.21840444866672, CurrSamplesPerSec=8.219625346049819, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6919, Loss: 0.047963790595531464 +[2024-01-22 00:41:10,309] [INFO] [logging.py:96:log_dist] [Rank 0] step=6920, skipped=0, lr=[4.67009142830932e-09, 4.67009142830932e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:41:10,310] [INFO] [timer.py:260:stop] epoch=2/micro_step=2262/global_step=6920, RunningAvgSamplesPerSec=8.218398599764226, CurrSamplesPerSec=8.231668664514572, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6929, Loss: 0.05265253037214279 +[2024-01-22 00:41:49,347] [INFO] [logging.py:96:log_dist] [Rank 0] step=6930, skipped=0, lr=[3.380141080844279e-09, 3.380141080844279e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:41:49,348] [INFO] [timer.py:260:stop] epoch=2/micro_step=2272/global_step=6930, RunningAvgSamplesPerSec=8.218370872791526, CurrSamplesPerSec=8.209835245721495, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6939, Loss: 0.02457846701145172 +[2024-01-22 00:42:28,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=6940, skipped=0, lr=[2.2982044644481726e-09, 2.2982044644481726e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:42:28,309] [INFO] [timer.py:260:stop] epoch=2/micro_step=2282/global_step=6940, RunningAvgSamplesPerSec=8.218366890801486, CurrSamplesPerSec=8.199312020747994, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6949, Loss: 0.0904906764626503 +[2024-01-22 00:43:07,250] [INFO] [logging.py:96:log_dist] [Rank 0] step=6950, skipped=0, lr=[1.4243040924954988e-09, 1.4243040924954988e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:43:07,250] [INFO] [timer.py:260:stop] epoch=2/micro_step=2292/global_step=6950, RunningAvgSamplesPerSec=8.218368852855866, CurrSamplesPerSec=8.23627551059544, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6959, Loss: 0.02865780144929886 +[2024-01-22 00:43:46,282] [INFO] [logging.py:96:log_dist] [Rank 0] step=6960, skipped=0, lr=[7.584581494635146e-10, 7.584581494635146e-10], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:43:46,282] [INFO] [timer.py:260:stop] epoch=2/micro_step=2302/global_step=6960, RunningAvgSamplesPerSec=8.218343250494074, CurrSamplesPerSec=8.179812159762932, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6969, Loss: 0.021456371992826462 +[2024-01-22 00:44:25,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=6970, skipped=0, lr=[3.0068049054254864e-10, 3.0068049054254864e-10], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:44:25,284] [INFO] [timer.py:260:stop] epoch=2/micro_step=2312/global_step=6970, RunningAvgSamplesPerSec=8.21832686795762, CurrSamplesPerSec=8.188987108502804, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +Epoch: 2, Total Step: 6979, Loss: 0.010015795938670635 +[2024-01-22 00:45:04,293] [INFO] [logging.py:96:log_dist] [Rank 0] step=6980, skipped=0, lr=[5.098064135733438e-11, 5.098064135733438e-11], mom=[(0.9, 0.95), (0.9, 0.95)] +[2024-01-22 00:45:04,294] [INFO] [timer.py:260:stop] epoch=2/micro_step=2322/global_step=6980, RunningAvgSamplesPerSec=8.218308126981325, CurrSamplesPerSec=8.231945333854826, MemAllocated=14.54GB, MaxMemAllocated=25.63GB +***** Evaluating perplexity, Epoch 3/3 ***** +ppl: 1.0083154439926147 +eval loss: 0.008281068876385689 +saving the final model ... +[2024-01-22 00:45:34,628] [INFO] [launch.py:347:main] Process 3720140 exits successfully. +[2024-01-22 00:45:34,628] [INFO] [launch.py:347:main] Process 3720141 exits successfully. +[2024-01-22 00:45:34,629] [INFO] [launch.py:347:main] Process 3720142 exits successfully. +[2024-01-22 00:45:34,629] [INFO] [launch.py:347:main] Process 3720143 exits successfully. +[2024-01-22 00:45:34,629] [INFO] [launch.py:347:main] Process 3720145 exits successfully. +[2024-01-22 00:45:35,630] [INFO] [launch.py:347:main] Process 3720144 exits successfully. +[2024-01-22 00:45:35,631] [INFO] [launch.py:347:main] Process 3720146 exits successfully. +[2024-01-22 00:45:41,637] [INFO] [launch.py:347:main] Process 3720139 exits successfully.