diff --git a/202311100214.log b/202311100214.log
new file mode 100644
index 0000000000000000000000000000000000000000..533fe80578792a6a2a5e07c50fc063341c7e848d
--- /dev/null
+++ b/202311100214.log
@@ -0,0 +1,10677 @@
+FlashAttention-2 is not installed, ignore this if you are not using FlashAttention.
+11/12/2023 02:44:48 - WARNING - llmtuner.tuner.core.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
+/home/data/condaEnv/llama-factory/lib/python3.10/site-packages/transformers/training_args.py:1711: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
+ warnings.warn(
+11/12/2023 02:44:48 - INFO - llmtuner.tuner.core.parser - Process rank: 1, device: cuda:1, n_gpu: 1
+ distributed training: True, compute dtype: torch.float16
+11/12/2023 02:44:48 - INFO - llmtuner.tuner.core.parser - Process rank: 4, device: cuda:4, n_gpu: 1
+ distributed training: True, compute dtype: torch.float16
+11/12/2023 02:44:48 - INFO - llmtuner.tuner.core.parser - Process rank: 3, device: cuda:3, n_gpu: 1
+ distributed training: True, compute dtype: torch.float16
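The `ddp_find_unused_parameters` warning above exists because, with LoRA, only the adapter weights require gradients: DDP's unused-parameter search adds a full graph traversal every step and interacts badly with gradient checkpointing, so llmtuner forces the flag to False (visible as `ddp_find_unused_parameters=False` in the training arguments below). A minimal sketch of the equivalent PyTorch-level setting, assuming a process group is already initialized; the wrapper function is illustrative, not llmtuner's code:

    import torch
    from torch.nn.parallel import DistributedDataParallel as DDP

    def wrap_for_ddp(model: torch.nn.Module, local_rank: int) -> DDP:
        """Wrap a LoRA-style model (frozen base, trainable adapters) for DDP."""
        model.to(f"cuda:{local_rank}")
        return DDP(
            model,
            device_ids=[local_rank],
            find_unused_parameters=False,  # what the warning asks for
        )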
+11/12/2023 02:44:48 - INFO - llmtuner.tuner.core.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
+_n_gpu=1,
+adafactor=False,
+adam_beta1=0.9,
+adam_beta2=0.999,
+adam_epsilon=1e-08,
+auto_find_batch_size=False,
+bf16=False,
+bf16_full_eval=False,
+data_seed=None,
+dataloader_drop_last=False,
+dataloader_num_workers=0,
+dataloader_pin_memory=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=False,
+ddp_timeout=1800,
+debug=[],
+deepspeed=None,
+disable_tqdm=False,
+dispatch_batches=None,
+do_eval=False,
+do_predict=False,
+do_train=True,
+eval_accumulation_steps=None,
+eval_delay=0,
+eval_steps=None,
+evaluation_strategy=no,
+fp16=True,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+fsdp=[],
+fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+generation_config=None,
+generation_max_length=None,
+generation_num_beams=None,
+gradient_accumulation_steps=8,
+gradient_checkpointing=False,
+greater_is_better=None,
+group_by_length=False,
+half_precision_backend=auto,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=False,
+hub_strategy=every_save,
+hub_token=<HUB_TOKEN>,
+ignore_data_skip=False,
+include_inputs_for_metrics=False,
+include_tokens_per_second=False,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+learning_rate=5e-05,
+length_column_name=length,
+load_best_model_at_end=False,
+local_rank=1,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=/home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/runs/Nov12_02-44-47_k8s-node1,
+logging_first_step=False,
+logging_nan_inf_filter=True,
+logging_steps=10,
+logging_strategy=steps,
+lr_scheduler_type=cosine,
+max_grad_norm=1.0,
+max_steps=-1,
+metric_for_best_model=None,
+mp_parameters=,
+no_cuda=False,
+num_train_epochs=3.0,
+optim=adamw_torch,
+optim_args=None,
+output_dir=/home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned,
+overwrite_output_dir=False,
+past_index=-1,
+per_device_eval_batch_size=8,
+per_device_train_batch_size=4,
+predict_with_generate=False,
+prediction_loss_only=False,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ray_scope=last,
+remove_unused_columns=True,
+report_to=[],
+resume_from_checkpoint=None,
+run_name=/home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned,
+save_on_each_node=False,
+save_safetensors=False,
+save_steps=200,
+save_strategy=steps,
+save_total_limit=None,
+seed=42,
+sharded_ddp=[],
+skip_memory_metrics=True,
+sortish_sampler=False,
+tf32=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torchdynamo=None,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+use_cpu=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_mps_device=False,
+warmup_ratio=0.0,
+warmup_steps=0,
+weight_decay=0.0,
+)
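Each of the eight ranks prints this same argument dump (only local_rank differs); one copy is kept above. Two of the arguments fix the realized optimization batch: per_device_train_batch_size=4 and gradient_accumulation_steps=8, multiplied across the 8 ranks visible in this log (cuda:0 through cuda:7; the world size is read off the log, not the arguments). A quick arithmetic check, with the step count hedged because the exact figure depends on how the sampler splits the 382000 examples:

    # Effective batch size implied by the logged arguments (8 DDP ranks).
    per_device_train_batch_size = 4
    gradient_accumulation_steps = 8
    world_size = 8  # cuda:0 .. cuda:7 in this log

    effective_batch = per_device_train_batch_size * gradient_accumulation_steps * world_size
    print(effective_batch)  # 256 samples per optimizer step

    # Rough total for num_train_epochs=3.0 over the 382000-example dataset:
    print(382_000 * 3 // effective_batch)  # ~4476 optimizer steps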
+11/12/2023 02:44:48 - INFO - llmtuner.dsets.loader - Loading dataset /home/hz/projects/LLaMA-Factory/data/wenshu_train.json...
+11/12/2023 02:44:48 - WARNING - llmtuner.dsets.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+11/12/2023 02:44:48 - INFO - llmtuner.tuner.core.parser - Process rank: 5, device: cuda:5, n_gpu: 1
+ distributed training: True, compute dtype: torch.float16
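The "Checksum failed" warning (printed once per rank) is benign: dataset_info.json simply carries no SHA-1 digest for wenshu_train.json, so llmtuner skips verification and loads the file anyway. A sketch of how one could compute the digest to fill in; the `file_sha1` key name follows LLaMA-Factory's dataset_info.json convention and should be treated as an assumption here:

    import hashlib

    # Streamed SHA-1, so a large JSON file never has to fit in memory.
    def file_sha1(path: str, chunk_size: int = 1 << 20) -> str:
        h = hashlib.sha1()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                h.update(chunk)
        return h.hexdigest()

    # Paste the result into the dataset's "file_sha1" entry (assumed key name).
    print(file_sha1("/home/hz/projects/LLaMA-Factory/data/wenshu_train.json"))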
+11/12/2023 02:44:48 - INFO - llmtuner.tuner.core.parser - Process rank: 6, device: cuda:6, n_gpu: 1
+ distributed training: True, compute dtype: torch.float16
+[INFO|training_args.py:1345] 2023-11-12 02:44:48,105 >> Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
+[INFO|training_args.py:1798] 2023-11-12 02:44:48,105 >> PyTorch: setting up devices
+11/12/2023 02:44:48 - INFO - llmtuner.tuner.core.parser - Process rank: 7, device: cuda:7, n_gpu: 1
+ distributed training: True, compute dtype: torch.float16
+11/12/2023 02:44:48 - INFO - llmtuner.tuner.core.parser - Process rank: 2, device: cuda:2, n_gpu: 1
+ distributed training: True, compute dtype: torch.float16
+11/12/2023 02:44:48 - INFO - llmtuner.tuner.core.parser - Process rank: 0, device: cuda:0, n_gpu: 1
+ distributed training: True, compute dtype: torch.float16
+Using custom data configuration default-6a98bd49aacf7ef0
+Loading Dataset Infos from /home/data/condaEnv/llama-factory/lib/python3.10/site-packages/datasets/packaged_modules/json
+Overwrite dataset info from restored data version if exists.
+Loading Dataset info from /home/hz/.cache/huggingface/datasets/json/default-6a98bd49aacf7ef0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
+Found cached dataset json (/home/hz/.cache/huggingface/datasets/json/default-6a98bd49aacf7ef0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
+Loading Dataset info from /home/hz/.cache/huggingface/datasets/json/default-6a98bd49aacf7ef0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
+[INFO|tokenization_utils_base.py:2041] 2023-11-12 02:44:49,194 >> loading file tokenizer.model
+[INFO|tokenization_utils_base.py:2041] 2023-11-12 02:44:49,194 >> loading file added_tokens.json
+[INFO|tokenization_utils_base.py:2041] 2023-11-12 02:44:49,194 >> loading file special_tokens_map.json
+[INFO|tokenization_utils_base.py:2041] 2023-11-12 02:44:49,194 >> loading file tokenizer_config.json
+[INFO|tokenization_utils_base.py:2041] 2023-11-12 02:44:49,194 >> loading file tokenizer.json
+[INFO|configuration_utils.py:713] 2023-11-12 02:44:49,377 >> loading configuration file /home/hz/projects/chatglm3-6b-32k/config.json
+[INFO|configuration_utils.py:713] 2023-11-12 02:44:49,378 >> loading configuration file /home/hz/projects/chatglm3-6b-32k/config.json
+[INFO|configuration_utils.py:775] 2023-11-12 02:44:49,379 >> Model config ChatGLMConfig {
+  "_name_or_path": "/home/hz/projects/chatglm3-6b-32k",
+  "add_bias_linear": false,
+  "add_qkv_bias": true,
+  "apply_query_key_layer_scaling": true,
+  "apply_residual_connection_post_layernorm": false,
+  "architectures": [
+    "ChatGLMModel"
+  ],
+  "attention_dropout": 0.0,
+  "attention_softmax_in_fp32": true,
+  "auto_map": {
+    "AutoConfig": "configuration_chatglm.ChatGLMConfig",
+    "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
+    "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
+    "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
+  },
+  "bias_dropout_fusion": true,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "ffn_hidden_size": 13696,
+  "fp32_residual_connection": false,
+  "hidden_dropout": 0.0,
+  "hidden_size": 4096,
+  "kv_channels": 128,
+  "layernorm_epsilon": 1e-05,
+  "model_type": "chatglm",
+  "multi_query_attention": true,
+  "multi_query_group_num": 2,
+  "num_attention_heads": 32,
+  "num_layers": 28,
+  "original_rope": true,
+  "pad_token_id": 0,
+  "padded_vocab_size": 65024,
"post_layer_norm": true, + "pre_seq_len": null, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "rope_ratio": 50, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 65024 +} + +[INFO|modeling_utils.py:2990] 2023-11-12 02:44:49,413 >> loading weights file /home/hz/projects/chatglm3-6b-32k/pytorch_model.bin.index.json +[INFO|modeling_utils.py:1220] 2023-11-12 02:44:49,413 >> Instantiating ChatGLMForConditionalGeneration model under default dtype torch.float16. +[INFO|configuration_utils.py:770] 2023-11-12 02:44:49,414 >> Generate config GenerationConfig { + "eos_token_id": 2, + "pad_token_id": 0 +} + + +Loading checkpoint shards: 0%| | 0/7 [00:00> All model checkpoint weights were used when initializing ChatGLMForConditionalGeneration. + +[INFO|modeling_utils.py:3783] 2023-11-12 02:44:59,427 >> All the weights of ChatGLMForConditionalGeneration were initialized from the model checkpoint at /home/hz/projects/chatglm3-6b-32k. +If your task is similar to the task the model of the checkpoint was trained on, you can already use ChatGLMForConditionalGeneration for predictions without further training. +11/12/2023 02:44:59 - INFO - llmtuner.tuner.core.adapter - Fine-tuning method: LoRA +[INFO|modeling_utils.py:3352] 2023-11-12 02:44:59,430 >> Generation config file not found, using a generation config created from the model config. +11/12/2023 02:44:59 - INFO - llmtuner.tuner.core.utils - Gradient checkpointing enabled. +11/12/2023 02:44:59 - INFO - llmtuner.tuner.core.adapter - Fine-tuning method: LoRA + +Loading checkpoint shards: 100%|██████████| 7/7 [00:09<00:00, 1.30s/it] +Loading checkpoint shards: 100%|██████████| 7/7 [00:09<00:00, 1.39s/it] +11/12/2023 02:44:59 - INFO - llmtuner.tuner.core.utils - Gradient checkpointing enabled. 
+11/12/2023 02:44:59 - INFO - llmtuner.tuner.core.loader - trainable params: 1949696 || all params: 6245533696 || trainable%: 0.0312
+[INFO|tokenization_utils_base.py:952] 2023-11-12 02:44:59,481 >> Assigning ['<|user|>', '<|observation|>'] to the additional_special_tokens key of the tokenizer
+[INFO|tokenization_utils.py:493] 2023-11-12 02:44:59,596 >> Adding <|user|> to the vocabulary
+[INFO|tokenization_utils.py:493] 2023-11-12 02:44:59,596 >> Adding <|observation|> to the vocabulary
+
+Running tokenizer on dataset:   0%|          | 0/382000 [00:00<?, ? examples/s]
+inputs:
+作为一个法律专家,你需要根据提供的法律案件描述和相关法条来生成法律文书的判决结果。你需要充分调用你的法律知识和推理能力。
+在JSON格式的法律案件中,“JudgeResult”是需要生成的判决结果,它是根据“JudgeAccusation”(原告被告指控)和“JudgeReason”(法院的推理归纳过程)得出的。“Case”则是案件的标题。
+现在给你一个新的案件如下。请根据“JudgeAccusation”和“JudgeReason”字段、相关法律法条和其他有用信息,得出该案件的判决结果“JudgeResult”。
+{'Case': '6585杨杰与潘宗锡、时维娟排除妨害纠纷一审民事判决书', 'JudgeAccusation': '原告杨杰向本院提出诉讼请求:潘宗锡、时维娟自坐落于无锡市惠山区的房屋迁出。事实和理由:杨杰购买坐落于无锡市惠山区的房屋,并于2018年4月19日办理存量房买卖转移登记。但至今潘宗锡、时维娟居住在该房屋内拒绝搬出。\n被告潘宗锡、时维娟均未作答辩。\n本院经审理查明如下事实:潘宗锡、时维娟系夫妻关系。2017年5月3日,潘宗锡、时维娟向江苏省无锡市梁溪公证处申请公证居民身份证、居民户口簿和结婚证,用于办理出售无锡市惠山区的房屋(面积为89.71平方米)。同日,潘宗锡、时维娟将上述房屋抵押给邓建峰,债权数额为40万元。2017年5月4日,潘宗锡向王芳借款30万元,年利率15%。2018年4月18日,王芳代理潘宗锡、时维娟与杨杰签订无锡市存量房买卖合同(合同备案号:201804180241),将上述房屋出售给杨杰。同日,王芳向杨杰出具收条明确已收到房款。2018年4月19日,上述房屋转移登记于杨杰名下。\n庭审中,杨杰称因潘宗锡、时维娟拒绝搬出上述房屋,其多次向洛社派出所报警,民警也多次上门处理,但潘宗锡、时维娟拒绝开门。所以,杨杰至今无法占有、使用该房屋', 'JudgeReason': '本院认为:不动产物权的设立、变更、转让和消灭,经依法登记,发生效力。妨害物权或可能妨害物权的,权利人可以排除妨害或者消除危险。本案中,杨杰为无锡市惠山区的房屋所有权人,对该房屋享有占有、使用、收益、处分的权利。潘宗锡、时维娟无权占有上述房屋,其行为侵犯了杨杰的物权,应迁出该房屋。综上,依照《中华人民共和国物权法》第九条、第三十五条、《中华人民共和国民事诉讼法》第一百四十四条之规定,判决如下'}<|assistant|> 潘宗锡、时维娟于本判决生效后十五日内迁出坐落于无锡市惠山区的房屋。
+本案受理费减半收取40元,由潘宗锡、时维娟负担。
+如不服本判决,可在判决书送达之日起十五日内,向本院递交上诉状,并按对方当事人的人数提出副本,上诉于江苏省无锡市中级人民法院。同时,根据《诉讼费用交纳办法》的规定,向该院预交上诉案件受理费
+label_ids:
+[-100, -100, -100, ... (every prompt position masked with -100) ..., -100, 30910, 56707, 55452, 56700, 31201, 54554, 55094, 57379, 54579, 54613, 35939, 37632, 54585, 33628, 37555, 56081, 54557, 48189, 36932, 54598, 55660, 54747, 34023, 33507, 31155, 13, 39002, 35783, 55000, 55325, 55205, 37694, 30972, 30940, 54751, 31123, 54781, 56707, 55452, 56700, 31201, 54554, 55094, 57379, 35022, 31155, 13, 54627, 40873, 54613, 35939, 31123, 36575, 35939, 54755, 43922, 38079, 33628, 37555, 31123, 54759, 42503, 45070, 39829, 55191, 31123, 54724, 55194, 32260, 35469, 47633, 32013, 51150, 31123, 39829, 54579, 34773, 36932, 54598, 44179, 31155, 31701, 31123, 31793, 54611, 33817, 32974, 54745, 55428, 32355, 54612, 34900, 31123, 54759, 54960, 54700, 55135, 54745, 39829, 32722, 35783, 55000, 2]
+labels:
+潘宗锡、时维娟于本判决生效后十五日内迁出坐落于无锡市惠山区的房屋。
+本案受理费减半收取40元,由潘宗锡、时维娟负担。
+如不服本判决,可在判决书送达之日起十五日内,向本院递交上诉状,并按对方当事人的人数提出副本,上诉于江苏省无锡市中级人民法院。同时,根据《诉讼费用交纳办法》的规定,向该院预交上诉案件受理费
+
+Running tokenizer on dataset:   0%|          | 0/382000 [00:00<?, ? examples/s]
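In the tokenized example above, label_ids carries -100 at every prompt position and real token ids only for the judgment text, so the loss is computed on the response alone: -100 is the default ignore_index of PyTorch's cross-entropy. A toy illustration with invented shapes and values:

    import torch
    import torch.nn.functional as F

    vocab_size = 10
    logits = torch.randn(6, vocab_size)                 # 6 sequence positions
    labels = torch.tensor([-100, -100, -100, 4, 7, 2])  # prompt masked, answer kept

    # Positions labeled -100 are skipped (ignore_index), so only the last
    # three positions -- the response tokens -- contribute to the loss.
    loss = F.cross_entropy(logits, labels, ignore_index=-100)
    print(loss)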
+[INFO|training_args.py:1798] 2023-11-12 03:03:21,435 >> PyTorch: setting up devices
+Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+Running tokenizer on dataset:   0%|          | 1000/382000 [00:03<20:13, 314.05 examples/s]
+[... interleaved per-rank tokenizer progress trimmed: 8 ranks, 382000 examples each, ~295-330 examples/s ...]
+Running tokenizer on dataset:   8%|▊         | 30000/382000 [01:35<18:37, 315.10 examples/s]
316.21 examples/s] +Running tokenizer on dataset: 8%|▊ | 29000/382000 [01:35<19:30, 301.62 examples/s] +Running tokenizer on dataset: 8%|▊ | 31000/382000 [01:36<18:07, 322.86 examples/s] +Running tokenizer on dataset: 8%|▊ | 31000/382000 [01:37<18:12, 321.28 examples/s] +Running tokenizer on dataset: 8%|▊ | 30000/382000 [01:37<19:43, 297.52 examples/s] +Running tokenizer on dataset: 8%|▊ | 30000/382000 [01:38<19:21, 303.17 examples/s] +Running tokenizer on dataset: 8%|▊ | 31000/382000 [01:38<18:30, 316.18 examples/s] +Running tokenizer on dataset: 8%|▊ | 31000/382000 [01:38<18:18, 319.47 examples/s] +Running tokenizer on dataset: 8%|▊ | 30000/382000 [01:39<19:28, 301.23 examples/s] +Running tokenizer on dataset: 8%|▊ | 32000/382000 [01:39<18:06, 322.24 examples/s] +Running tokenizer on dataset: 8%|▊ | 32000/382000 [01:40<18:13, 320.16 examples/s] +Running tokenizer on dataset: 8%|▊ | 31000/382000 [01:40<19:24, 301.48 examples/s] +Running tokenizer on dataset: 8%|▊ | 31000/382000 [01:41<19:07, 305.86 examples/s] +Running tokenizer on dataset: 8%|▊ | 32000/382000 [01:41<18:33, 314.27 examples/s] +Running tokenizer on dataset: 8%|▊ | 32000/382000 [01:41<18:20, 318.10 examples/s] +Running tokenizer on dataset: 9%|▊ | 33000/382000 [01:42<18:02, 322.28 examples/s] +Running tokenizer on dataset: 8%|▊ | 31000/382000 [01:42<19:24, 301.45 examples/s] +Running tokenizer on dataset: 9%|▊ | 33000/382000 [01:43<18:06, 321.14 examples/s] +Running tokenizer on dataset: 8%|▊ | 32000/382000 [01:44<19:22, 301.15 examples/s] +Running tokenizer on dataset: 9%|▊ | 33000/382000 [01:44<18:10, 320.00 examples/s] +Running tokenizer on dataset: 9%|▊ | 33000/382000 [01:44<18:30, 314.27 examples/s] +Running tokenizer on dataset: 8%|▊ | 32000/382000 [01:44<19:21, 301.27 examples/s] +Running tokenizer on dataset: 9%|▉ | 34000/382000 [01:45<17:57, 322.92 examples/s] +Running tokenizer on dataset: 8%|▊ | 32000/382000 [01:45<19:10, 304.25 examples/s] +Running tokenizer on dataset: 9%|▉ | 34000/382000 [01:46<18:06, 320.44 examples/s] +Running tokenizer on dataset: 9%|▊ | 33000/382000 [01:47<19:12, 302.76 examples/s] +Running tokenizer on dataset: 9%|▉ | 34000/382000 [01:47<18:06, 320.20 examples/s] +Running tokenizer on dataset: 9%|▉ | 34000/382000 [01:48<18:26, 314.54 examples/s] +Running tokenizer on dataset: 9%|▊ | 33000/382000 [01:48<19:15, 301.91 examples/s] +Running tokenizer on dataset: 9%|▉ | 35000/382000 [01:48<17:47, 325.16 examples/s] +Running tokenizer on dataset: 9%|▊ | 33000/382000 [01:49<19:00, 305.93 examples/s] +Running tokenizer on dataset: 9%|▉ | 35000/382000 [01:49<17:50, 324.05 examples/s] +Running tokenizer on dataset: 9%|▉ | 34000/382000 [01:50<19:18, 300.47 examples/s] +Running tokenizer on dataset: 9%|▉ | 35000/382000 [01:51<18:01, 320.71 examples/s] +Running tokenizer on dataset: 9%|▉ | 35000/382000 [01:51<18:14, 316.92 examples/s] +Running tokenizer on dataset: 9%|▉ | 34000/382000 [01:51<19:15, 301.27 examples/s] +Running tokenizer on dataset: 9%|▉ | 36000/382000 [01:51<17:42, 325.54 examples/s] +Running tokenizer on dataset: 9%|▉ | 34000/382000 [01:52<19:01, 304.94 examples/s] +Running tokenizer on dataset: 9%|▉ | 36000/382000 [01:52<17:49, 323.65 examples/s] +Running tokenizer on dataset: 9%|▉ | 35000/382000 [01:53<18:49, 307.18 examples/s] +Running tokenizer on dataset: 9%|▉ | 36000/382000 [01:54<18:01, 319.97 examples/s] +Running tokenizer on dataset: 9%|▉ | 36000/382000 [01:54<18:09, 317.54 examples/s] +Running tokenizer on dataset: 10%|▉ | 37000/382000 [01:54<17:33, 327.42 examples/s] 
+Running tokenizer on dataset: 9%|▉ | 35000/382000 [01:54<18:49, 307.18 examples/s] +Running tokenizer on dataset: 9%|▉ | 35000/382000 [01:55<18:53, 306.23 examples/s] +Running tokenizer on dataset: 10%|▉ | 37000/382000 [01:55<17:44, 324.05 examples/s] +Running tokenizer on dataset: 9%|▉ | 36000/382000 [01:57<18:44, 307.69 examples/s] +Running tokenizer on dataset: 10%|▉ | 37000/382000 [01:57<17:56, 320.58 examples/s] +Running tokenizer on dataset: 10%|▉ | 37000/382000 [01:57<18:02, 318.68 examples/s] +Running tokenizer on dataset: 10%|▉ | 38000/382000 [01:57<17:39, 324.59 examples/s] +Running tokenizer on dataset: 9%|▉ | 36000/382000 [01:57<18:54, 304.91 examples/s] +Running tokenizer on dataset: 9%|▉ | 36000/382000 [01:58<18:46, 307.06 examples/s] +Running tokenizer on dataset: 10%|▉ | 38000/382000 [01:58<17:50, 321.35 examples/s] +Running tokenizer on dataset: 10%|▉ | 37000/382000 [02:00<18:44, 306.71 examples/s] +Running tokenizer on dataset: 10%|▉ | 38000/382000 [02:00<17:59, 318.65 examples/s] +Running tokenizer on dataset: 10%|▉ | 38000/382000 [02:00<18:07, 316.40 examples/s] +Running tokenizer on dataset: 10%|█ | 39000/382000 [02:01<17:56, 318.49 examples/s] +Running tokenizer on dataset: 10%|▉ | 37000/382000 [02:01<18:39, 308.08 examples/s] +Running tokenizer on dataset: 10%|▉ | 37000/382000 [02:02<18:35, 309.23 examples/s] +Running tokenizer on dataset: 10%|█ | 39000/382000 [02:02<17:55, 319.00 examples/s] +Running tokenizer on dataset: 10%|█ | 39000/382000 [02:03<18:05, 315.88 examples/s] +Running tokenizer on dataset: 10%|▉ | 38000/382000 [02:03<18:56, 302.67 examples/s] +Running tokenizer on dataset: 10%|█ | 39000/382000 [02:03<18:06, 315.81 examples/s] +Running tokenizer on dataset: 10%|█ | 40000/382000 [02:04<18:01, 316.26 examples/s] +Running tokenizer on dataset: 10%|▉ | 38000/382000 [02:04<18:34, 308.74 examples/s] +Running tokenizer on dataset: 10%|▉ | 38000/382000 [02:05<18:40, 306.95 examples/s] +Running tokenizer on dataset: 10%|█ | 40000/382000 [02:05<18:04, 315.25 examples/s] +Running tokenizer on dataset: 10%|█ | 40000/382000 [02:07<18:15, 312.17 examples/s] +Running tokenizer on dataset: 10%|█ | 40000/382000 [02:07<18:16, 312.00 examples/s] +Running tokenizer on dataset: 10%|█ | 39000/382000 [02:07<19:01, 300.57 examples/s] +Running tokenizer on dataset: 11%|█ | 41000/382000 [02:07<18:12, 312.19 examples/s] +Running tokenizer on dataset: 10%|█ | 39000/382000 [02:07<18:51, 303.08 examples/s] +Running tokenizer on dataset: 11%|█ | 41000/382000 [02:08<17:53, 317.56 examples/s] +Running tokenizer on dataset: 10%|█ | 39000/382000 [02:08<18:36, 307.07 examples/s] +Running tokenizer on dataset: 11%|█ | 41000/382000 [02:10<18:07, 313.57 examples/s] +Running tokenizer on dataset: 11%|█ | 41000/382000 [02:10<18:10, 312.63 examples/s] +Running tokenizer on dataset: 10%|█ | 40000/382000 [02:10<19:10, 297.18 examples/s] +Running tokenizer on dataset: 11%|█ | 42000/382000 [02:10<18:18, 309.54 examples/s] +Running tokenizer on dataset: 10%|█ | 40000/382000 [02:11<19:00, 299.82 examples/s] +Running tokenizer on dataset: 11%|█ | 42000/382000 [02:11<17:55, 316.28 examples/s] +Running tokenizer on dataset: 10%|█ | 40000/382000 [02:11<18:47, 303.27 examples/s] +Running tokenizer on dataset: 11%|█ | 42000/382000 [02:13<18:08, 312.22 examples/s] +Running tokenizer on dataset: 11%|█ | 42000/382000 [02:13<18:18, 309.40 examples/s] +Running tokenizer on dataset: 11%|█ | 41000/382000 [02:13<18:58, 299.64 examples/s] +Running tokenizer on dataset: 11%|█▏ | 43000/382000 [02:14<18:07, 311.73 
examples/s] +Running tokenizer on dataset: 11%|█ | 41000/382000 [02:14<19:03, 298.25 examples/s] +Running tokenizer on dataset: 11%|█▏ | 43000/382000 [02:14<17:48, 317.23 examples/s] +Running tokenizer on dataset: 11%|█ | 41000/382000 [02:15<18:54, 300.65 examples/s] +Running tokenizer on dataset: 11%|█▏ | 43000/382000 [02:16<18:08, 311.39 examples/s] +Running tokenizer on dataset: 11%|█▏ | 43000/382000 [02:16<18:20, 307.92 examples/s] +Running tokenizer on dataset: 11%|█ | 42000/382000 [02:17<19:16, 294.09 examples/s] +Running tokenizer on dataset: 12%|█▏ | 44000/382000 [02:17<18:27, 305.14 examples/s] +Running tokenizer on dataset: 11%|█ | 42000/382000 [02:17<18:53, 299.85 examples/s] +Running tokenizer on dataset: 12%|█▏ | 44000/382000 [02:17<17:51, 315.31 examples/s] +Running tokenizer on dataset: 11%|█ | 42000/382000 [02:18<18:45, 302.05 examples/s] +Running tokenizer on dataset: 12%|█▏ | 44000/382000 [02:19<18:16, 308.33 examples/s] +Running tokenizer on dataset: 12%|█▏ | 44000/382000 [02:20<18:15, 308.54 examples/s] +Running tokenizer on dataset: 12%|█▏ | 45000/382000 [02:20<18:05, 310.46 examples/s] +Running tokenizer on dataset: 11%|█▏ | 43000/382000 [02:20<19:04, 296.19 examples/s] +Running tokenizer on dataset: 12%|█▏ | 45000/382000 [02:21<17:46, 315.93 examples/s] +Running tokenizer on dataset: 11%|█▏ | 43000/382000 [02:21<18:48, 300.27 examples/s] +Running tokenizer on dataset: 11%|█▏ | 43000/382000 [02:22<18:52, 299.21 examples/s] +Running tokenizer on dataset: 12%|█▏ | 45000/382000 [02:23<18:07, 309.74 examples/s] +Running tokenizer on dataset: 12%|█▏ | 45000/382000 [02:23<18:04, 310.65 examples/s] +Running tokenizer on dataset: 12%|█▏ | 46000/382000 [02:23<18:11, 307.74 examples/s] +Running tokenizer on dataset: 12%|█▏ | 44000/382000 [02:24<19:07, 294.66 examples/s] +Running tokenizer on dataset: 12%|█▏ | 46000/382000 [02:24<17:45, 315.37 examples/s] +Running tokenizer on dataset: 12%|█▏ | 44000/382000 [02:24<18:35, 302.90 examples/s] +Running tokenizer on dataset: 12%|█▏ | 44000/382000 [02:25<18:46, 300.08 examples/s] +Running tokenizer on dataset: 12%|█▏ | 46000/382000 [02:26<18:08, 308.55 examples/s] +Running tokenizer on dataset: 12%|█▏ | 46000/382000 [02:26<18:08, 308.70 examples/s] +Running tokenizer on dataset: 12%|█▏ | 47000/382000 [02:27<18:15, 305.92 examples/s] +Running tokenizer on dataset: 12%|█▏ | 47000/382000 [02:27<17:49, 313.32 examples/s] +Running tokenizer on dataset: 12%|█▏ | 45000/382000 [02:27<18:58, 296.08 examples/s] +Running tokenizer on dataset: 12%|█▏ | 45000/382000 [02:27<18:27, 304.34 examples/s] +Running tokenizer on dataset: 12%|█▏ | 45000/382000 [02:28<18:27, 304.34 examples/s] +Running tokenizer on dataset: 12%|█▏ | 47000/382000 [02:29<18:05, 308.75 examples/s] +Running tokenizer on dataset: 12%|█▏ | 47000/382000 [02:29<18:07, 308.12 examples/s] +Running tokenizer on dataset: 13%|█▎ | 48000/382000 [02:30<18:06, 307.53 examples/s] +Running tokenizer on dataset: 13%|█▎ | 48000/382000 [02:30<17:37, 315.91 examples/s] +Running tokenizer on dataset: 12%|█▏ | 46000/382000 [02:30<18:55, 295.95 examples/s] +Running tokenizer on dataset: 12%|█▏ | 46000/382000 [02:31<18:31, 302.33 examples/s] +Running tokenizer on dataset: 12%|█▏ | 46000/382000 [02:31<18:34, 301.53 examples/s] +Running tokenizer on dataset: 13%|█▎ | 48000/382000 [02:32<17:57, 310.10 examples/s] +Running tokenizer on dataset: 13%|█▎ | 48000/382000 [02:33<18:02, 308.67 examples/s] +Running tokenizer on dataset: 13%|█▎ | 49000/382000 [02:33<17:53, 310.07 examples/s] +Running tokenizer on 
dataset: 13%|█▎ | 49000/382000 [02:33<17:31, 316.82 examples/s] +Running tokenizer on dataset: 12%|█▏ | 47000/382000 [02:34<18:39, 299.12 examples/s] +Running tokenizer on dataset: 12%|█▏ | 47000/382000 [02:34<18:28, 302.31 examples/s] +Running tokenizer on dataset: 12%|█▏ | 47000/382000 [02:35<18:31, 301.27 examples/s] +Running tokenizer on dataset: 13%|█▎ | 49000/382000 [02:36<17:51, 310.91 examples/s] +Running tokenizer on dataset: 13%|█▎ | 49000/382000 [02:36<17:50, 311.07 examples/s] +Running tokenizer on dataset: 13%|█▎ | 50000/382000 [02:36<17:55, 308.62 examples/s] +Running tokenizer on dataset: 13%|█▎ | 50000/382000 [02:37<17:33, 315.24 examples/s] +Running tokenizer on dataset: 13%|█▎ | 48000/382000 [02:37<18:30, 300.85 examples/s] +Running tokenizer on dataset: 13%|█▎ | 48000/382000 [02:37<18:21, 303.22 examples/s] +Running tokenizer on dataset: 13%|█▎ | 48000/382000 [02:38<18:15, 304.82 examples/s] +Running tokenizer on dataset: 13%|█▎ | 50000/382000 [02:39<17:41, 312.90 examples/s] +Running tokenizer on dataset: 13%|█▎ | 50000/382000 [02:39<17:45, 311.50 examples/s] +Running tokenizer on dataset: 13%|█▎ | 51000/382000 [02:39<17:37, 312.95 examples/s] +Running tokenizer on dataset: 13%|█▎ | 51000/382000 [02:40<17:31, 314.89 examples/s] +Running tokenizer on dataset: 13%|█▎ | 49000/382000 [02:40<18:20, 302.58 examples/s] +Running tokenizer on dataset: 13%|█▎ | 49000/382000 [02:40<18:04, 306.97 examples/s] +Running tokenizer on dataset: 13%|█▎ | 49000/382000 [02:41<18:00, 308.16 examples/s] +Running tokenizer on dataset: 13%|█▎ | 51000/382000 [02:42<17:30, 315.14 examples/s] +Running tokenizer on dataset: 13%|█▎ | 51000/382000 [02:42<17:35, 313.64 examples/s] +Running tokenizer on dataset: 14%|█▎ | 52000/382000 [02:43<17:24, 315.90 examples/s] +Running tokenizer on dataset: 14%|█▎ | 52000/382000 [02:43<17:31, 313.90 examples/s] +Running tokenizer on dataset: 13%|█▎ | 50000/382000 [02:44<18:12, 303.79 examples/s] +Running tokenizer on dataset: 13%|█▎ | 50000/382000 [02:44<18:01, 306.98 examples/s] +Running tokenizer on dataset: 13%|█▎ | 50000/382000 [02:45<18:15, 303.01 examples/s] +Running tokenizer on dataset: 14%|█▎ | 52000/382000 [02:45<17:29, 314.33 examples/s] +Running tokenizer on dataset: 14%|█▎ | 52000/382000 [02:45<17:32, 313.58 examples/s] +Running tokenizer on dataset: 14%|█▍ | 53000/382000 [02:46<17:18, 316.79 examples/s] +Running tokenizer on dataset: 14%|█▍ | 53000/382000 [02:46<17:41, 309.95 examples/s] +Running tokenizer on dataset: 13%|█▎ | 51000/382000 [02:47<17:43, 311.19 examples/s] +Running tokenizer on dataset: 13%|█▎ | 51000/382000 [02:47<18:16, 301.75 examples/s] +Running tokenizer on dataset: 13%|█▎ | 51000/382000 [02:48<18:00, 306.21 examples/s] +Running tokenizer on dataset: 14%|█▍ | 53000/382000 [02:48<17:29, 313.37 examples/s] +Running tokenizer on dataset: 14%|█▍ | 53000/382000 [02:48<17:29, 313.60 examples/s] +Running tokenizer on dataset: 14%|█▍ | 54000/382000 [02:49<16:58, 321.92 examples/s] +Running tokenizer on dataset: 14%|█▍ | 54000/382000 [02:49<17:25, 313.79 examples/s] +Running tokenizer on dataset: 14%|█▎ | 52000/382000 [02:50<17:43, 310.41 examples/s] +Running tokenizer on dataset: 14%|█▎ | 52000/382000 [02:50<18:13, 301.88 examples/s] +Running tokenizer on dataset: 14%|█▎ | 52000/382000 [02:51<17:59, 305.73 examples/s] +Running tokenizer on dataset: 14%|█▍ | 54000/382000 [02:51<17:11, 318.05 examples/s] +Running tokenizer on dataset: 14%|█▍ | 54000/382000 [02:52<17:16, 316.42 examples/s] +Running tokenizer on dataset: 14%|█▍ | 
55000/382000 [02:52<16:52, 323.09 examples/s] +Running tokenizer on dataset: 14%|█▍ | 55000/382000 [02:53<17:22, 313.66 examples/s] +Running tokenizer on dataset: 14%|█▍ | 53000/382000 [02:53<17:37, 311.03 examples/s] +Running tokenizer on dataset: 14%|█▍ | 53000/382000 [02:54<18:17, 299.74 examples/s] +Running tokenizer on dataset: 14%|█▍ | 53000/382000 [02:54<17:54, 306.14 examples/s] +Running tokenizer on dataset: 14%|█▍ | 55000/382000 [02:54<17:04, 319.27 examples/s] +Running tokenizer on dataset: 14%|█▍ | 55000/382000 [02:55<17:08, 318.00 examples/s] +Running tokenizer on dataset: 15%|█▍ | 56000/382000 [02:55<16:56, 320.63 examples/s] +Running tokenizer on dataset: 15%|█▍ | 56000/382000 [02:56<17:27, 311.17 examples/s] +Running tokenizer on dataset: 14%|█▍ | 54000/382000 [02:56<17:21, 314.87 examples/s] +Running tokenizer on dataset: 14%|█▍ | 54000/382000 [02:57<18:07, 301.61 examples/s] +Running tokenizer on dataset: 14%|█▍ | 54000/382000 [02:57<17:41, 308.92 examples/s] +Running tokenizer on dataset: 15%|█▍ | 56000/382000 [02:58<17:11, 316.01 examples/s] +Running tokenizer on dataset: 15%|█▍ | 56000/382000 [02:58<17:11, 316.01 examples/s] +Running tokenizer on dataset: 15%|█▍ | 57000/382000 [02:58<17:19, 312.51 examples/s] +Running tokenizer on dataset: 15%|█▍ | 57000/382000 [02:59<17:19, 312.70 examples/s] +Running tokenizer on dataset: 14%|█▍ | 55000/382000 [02:59<17:14, 315.97 examples/s] +Running tokenizer on dataset: 14%|█▍ | 55000/382000 [03:00<18:02, 301.99 examples/s] +Running tokenizer on dataset: 14%|█▍ | 55000/382000 [03:01<17:32, 310.73 examples/s] +Running tokenizer on dataset: 15%|█▍ | 57000/382000 [03:01<17:03, 317.42 examples/s] +Running tokenizer on dataset: 15%|█▍ | 57000/382000 [03:01<17:04, 317.09 examples/s] +Running tokenizer on dataset: 15%|█▌ | 58000/382000 [03:02<17:19, 311.81 examples/s] +Running tokenizer on dataset: 15%|█▌ | 58000/382000 [03:02<17:13, 313.38 examples/s] +Running tokenizer on dataset: 15%|█▍ | 56000/382000 [03:03<17:14, 315.27 examples/s] +Running tokenizer on dataset: 15%|█▍ | 56000/382000 [03:03<17:51, 304.19 examples/s] +Running tokenizer on dataset: 15%|█▍ | 56000/382000 [03:04<17:33, 309.42 examples/s] +Running tokenizer on dataset: 15%|█▌ | 58000/382000 [03:04<17:04, 316.27 examples/s] +Running tokenizer on dataset: 15%|█▌ | 58000/382000 [03:04<17:05, 315.82 examples/s] +Running tokenizer on dataset: 15%|█▌ | 59000/382000 [03:05<17:08, 314.04 examples/s] +Running tokenizer on dataset: 15%|█▌ | 59000/382000 [03:05<17:11, 313.24 examples/s] +Running tokenizer on dataset: 15%|█▍ | 57000/382000 [03:06<17:08, 316.02 examples/s] +Running tokenizer on dataset: 15%|█▍ | 57000/382000 [03:07<17:43, 305.53 examples/s] +Running tokenizer on dataset: 15%|█▌ | 59000/382000 [03:07<16:56, 317.66 examples/s] +Running tokenizer on dataset: 15%|█▍ | 57000/382000 [03:07<17:35, 307.98 examples/s] +Running tokenizer on dataset: 15%|█▌ | 59000/382000 [03:07<17:04, 315.28 examples/s] +Running tokenizer on dataset: 16%|█▌ | 60000/382000 [03:08<17:02, 315.05 examples/s] +Running tokenizer on dataset: 16%|█▌ | 60000/382000 [03:09<17:07, 313.40 examples/s] +Running tokenizer on dataset: 15%|█▌ | 58000/382000 [03:09<17:05, 315.93 examples/s] +Running tokenizer on dataset: 15%|█▌ | 58000/382000 [03:10<17:52, 302.09 examples/s] +Running tokenizer on dataset: 16%|█▌ | 60000/382000 [03:10<16:55, 316.96 examples/s] +Running tokenizer on dataset: 16%|█▌ | 60000/382000 [03:10<16:56, 316.76 examples/s] +Running tokenizer on dataset: 15%|█▌ | 58000/382000 [03:11<17:41, 
305.23 examples/s] +Running tokenizer on dataset: 16%|█▌ | 61000/382000 [03:11<16:58, 315.28 examples/s] +Running tokenizer on dataset: 16%|█▌ | 61000/382000 [03:12<17:04, 313.39 examples/s] +Running tokenizer on dataset: 15%|█▌ | 59000/382000 [03:12<17:00, 316.53 examples/s] +Running tokenizer on dataset: 15%|█▌ | 59000/382000 [03:13<17:47, 302.46 examples/s] +Running tokenizer on dataset: 16%|█▌ | 61000/382000 [03:13<16:58, 315.06 examples/s] +Running tokenizer on dataset: 16%|█▌ | 61000/382000 [03:14<16:55, 316.12 examples/s] +Running tokenizer on dataset: 15%|█▌ | 59000/382000 [03:14<17:31, 307.28 examples/s] +Running tokenizer on dataset: 16%|█▌ | 62000/382000 [03:14<16:50, 316.55 examples/s] +Running tokenizer on dataset: 16%|█▌ | 62000/382000 [03:15<16:50, 316.62 examples/s] +Running tokenizer on dataset: 16%|█▌ | 60000/382000 [03:15<16:54, 317.28 examples/s] +Running tokenizer on dataset: 16%|█▌ | 62000/382000 [03:17<16:50, 316.63 examples/s] +Running tokenizer on dataset: 16%|█▌ | 62000/382000 [03:17<16:47, 317.60 examples/s] +Running tokenizer on dataset: 16%|█▌ | 60000/382000 [03:17<18:00, 298.04 examples/s] +Running tokenizer on dataset: 16%|█▌ | 60000/382000 [03:17<17:26, 307.78 examples/s] +Running tokenizer on dataset: 16%|█▋ | 63000/382000 [03:17<16:40, 318.85 examples/s] +Running tokenizer on dataset: 16%|█▋ | 63000/382000 [03:18<16:44, 317.41 examples/s] +Running tokenizer on dataset: 16%|█▌ | 61000/382000 [03:18<16:53, 316.69 examples/s] +Running tokenizer on dataset: 16%|█▋ | 63000/382000 [03:20<16:45, 317.25 examples/s] +Running tokenizer on dataset: 16%|█▋ | 63000/382000 [03:20<16:39, 319.00 examples/s] +Running tokenizer on dataset: 16%|█▌ | 61000/382000 [03:20<17:56, 298.08 examples/s] +Running tokenizer on dataset: 16%|█▌ | 61000/382000 [03:20<17:34, 304.47 examples/s] +Running tokenizer on dataset: 17%|█▋ | 64000/382000 [03:20<16:38, 318.53 examples/s] +Running tokenizer on dataset: 17%|█▋ | 64000/382000 [03:21<16:45, 316.30 examples/s] +Running tokenizer on dataset: 16%|█▌ | 62000/382000 [03:21<16:40, 319.98 examples/s] +Running tokenizer on dataset: 17%|█▋ | 64000/382000 [03:23<16:49, 315.01 examples/s] +Running tokenizer on dataset: 17%|█▋ | 64000/382000 [03:23<16:45, 316.16 examples/s] +Running tokenizer on dataset: 17%|█▋ | 65000/382000 [03:23<16:31, 319.78 examples/s] +Running tokenizer on dataset: 16%|█▌ | 62000/382000 [03:23<17:14, 309.18 examples/s] +Running tokenizer on dataset: 16%|█▌ | 62000/382000 [03:23<17:48, 299.53 examples/s] +Running tokenizer on dataset: 17%|█▋ | 65000/382000 [03:24<16:38, 317.51 examples/s] +Running tokenizer on dataset: 16%|█▋ | 63000/382000 [03:24<16:35, 320.48 examples/s] +Running tokenizer on dataset: 17%|█▋ | 65000/382000 [03:26<16:42, 316.36 examples/s] +Running tokenizer on dataset: 17%|█▋ | 65000/382000 [03:26<16:36, 318.22 examples/s] +Running tokenizer on dataset: 17%|█▋ | 66000/382000 [03:27<16:30, 319.03 examples/s] +Running tokenizer on dataset: 16%|█▋ | 63000/382000 [03:27<17:09, 309.98 examples/s] +Running tokenizer on dataset: 16%|█▋ | 63000/382000 [03:27<17:40, 300.88 examples/s] +Running tokenizer on dataset: 17%|█▋ | 66000/382000 [03:27<16:37, 316.79 examples/s] +Running tokenizer on dataset: 17%|█▋ | 64000/382000 [03:28<16:39, 318.19 examples/s] +Running tokenizer on dataset: 17%|█▋ | 66000/382000 [03:29<16:41, 315.52 examples/s] +Running tokenizer on dataset: 17%|█▋ | 66000/382000 [03:29<16:38, 316.48 examples/s] +Running tokenizer on dataset: 18%|█▊ | 67000/382000 [03:30<16:27, 318.84 examples/s] +Running 
tokenizer on dataset: 17%|█▋ | 64000/382000 [03:30<17:03, 310.55 examples/s] +Running tokenizer on dataset: 17%|█▋ | 64000/382000 [03:30<17:37, 300.69 examples/s] +Running tokenizer on dataset: 18%|█▊ | 67000/382000 [03:31<16:37, 315.91 examples/s] +Running tokenizer on dataset: 17%|█▋ | 65000/382000 [03:31<16:33, 319.16 examples/s] +Running tokenizer on dataset: 18%|█▊ | 67000/382000 [03:32<16:41, 314.42 examples/s] +Running tokenizer on dataset: 18%|█▊ | 67000/382000 [03:33<16:39, 315.12 examples/s] +Running tokenizer on dataset: 18%|█▊ | 68000/382000 [03:33<16:24, 319.09 examples/s] +Running tokenizer on dataset: 17%|█▋ | 65000/382000 [03:33<17:00, 310.60 examples/s] +Running tokenizer on dataset: 17%|█▋ | 65000/382000 [03:33<17:38, 299.37 examples/s] +Running tokenizer on dataset: 18%|█▊ | 68000/382000 [03:34<16:37, 314.81 examples/s] +Running tokenizer on dataset: 17%|█▋ | 66000/382000 [03:34<16:32, 318.29 examples/s] +Running tokenizer on dataset: 18%|█▊ | 68000/382000 [03:36<16:42, 313.34 examples/s] +Running tokenizer on dataset: 18%|█▊ | 68000/382000 [03:36<16:37, 314.70 examples/s] +Running tokenizer on dataset: 18%|█▊ | 69000/382000 [03:36<16:18, 319.88 examples/s] +Running tokenizer on dataset: 17%|█▋ | 66000/382000 [03:36<17:02, 308.92 examples/s] +Running tokenizer on dataset: 17%|█▋ | 66000/382000 [03:37<17:33, 299.87 examples/s] +Running tokenizer on dataset: 18%|█▊ | 69000/382000 [03:37<16:26, 317.33 examples/s] +Running tokenizer on dataset: 18%|█▊ | 67000/382000 [03:37<16:35, 316.37 examples/s] +Running tokenizer on dataset: 18%|█▊ | 69000/382000 [03:39<16:35, 314.34 examples/s] +Running tokenizer on dataset: 18%|█▊ | 69000/382000 [03:39<16:35, 314.56 examples/s] +Running tokenizer on dataset: 18%|█▊ | 70000/382000 [03:39<16:51, 308.37 examples/s] +Running tokenizer on dataset: 18%|█▊ | 67000/382000 [03:40<17:08, 306.18 examples/s] +Running tokenizer on dataset: 18%|█▊ | 67000/382000 [03:40<17:27, 300.73 examples/s] +Running tokenizer on dataset: 18%|█▊ | 70000/382000 [03:40<16:27, 316.05 examples/s] +Running tokenizer on dataset: 18%|█▊ | 68000/382000 [03:40<16:35, 315.34 examples/s] +Running tokenizer on dataset: 18%|█▊ | 70000/382000 [03:42<16:43, 310.78 examples/s] +Running tokenizer on dataset: 18%|█▊ | 70000/382000 [03:42<16:34, 313.80 examples/s] +Running tokenizer on dataset: 19%|█▊ | 71000/382000 [03:43<16:46, 308.84 examples/s] +Running tokenizer on dataset: 18%|█▊ | 68000/382000 [03:43<17:11, 304.51 examples/s] +Running tokenizer on dataset: 19%|█▊ | 71000/382000 [03:43<16:13, 319.39 examples/s] +Running tokenizer on dataset: 18%|█▊ | 68000/382000 [03:43<17:18, 302.26 examples/s] +Running tokenizer on dataset: 18%|█▊ | 69000/382000 [03:43<16:27, 316.82 examples/s] +Running tokenizer on dataset: 19%|█▊ | 71000/382000 [03:45<16:18, 317.70 examples/s] +Running tokenizer on dataset: 19%|█▊ | 71000/382000 [03:45<16:34, 312.83 examples/s] +Running tokenizer on dataset: 19%|█▉ | 72000/382000 [03:46<16:40, 309.80 examples/s] +Running tokenizer on dataset: 18%|█▊ | 69000/382000 [03:46<16:56, 308.05 examples/s] +Running tokenizer on dataset: 19%|█▉ | 72000/382000 [03:46<16:10, 319.27 examples/s] +Running tokenizer on dataset: 18%|█▊ | 69000/382000 [03:47<17:13, 302.91 examples/s] +Running tokenizer on dataset: 18%|█▊ | 70000/382000 [03:47<16:31, 314.56 examples/s] +Running tokenizer on dataset: 19%|█▉ | 72000/382000 [03:48<16:17, 317.22 examples/s] +Running tokenizer on dataset: 19%|█▉ | 72000/382000 [03:48<16:25, 314.63 examples/s] +Running tokenizer on dataset: 19%|█▉ 
| 73000/382000 [03:49<16:32, 311.41 examples/s] +Running tokenizer on dataset: 19%|█▉ | 73000/382000 [03:49<16:01, 321.51 examples/s] +Running tokenizer on dataset: 18%|█▊ | 70000/382000 [03:49<16:46, 310.01 examples/s] +Running tokenizer on dataset: 19%|█▊ | 71000/382000 [03:50<16:17, 318.02 examples/s] +Running tokenizer on dataset: 18%|█▊ | 70000/382000 [03:50<17:00, 305.71 examples/s] +Running tokenizer on dataset: 19%|█▉ | 73000/382000 [03:52<16:09, 318.73 examples/s] +Running tokenizer on dataset: 19%|█▉ | 73000/382000 [03:52<16:17, 316.04 examples/s] +Running tokenizer on dataset: 19%|█▉ | 74000/382000 [03:52<16:35, 309.35 examples/s] +Running tokenizer on dataset: 19%|█▉ | 74000/382000 [03:52<15:58, 321.41 examples/s] +Running tokenizer on dataset: 19%|█▊ | 71000/382000 [03:53<16:38, 311.44 examples/s] +Running tokenizer on dataset: 19%|█▉ | 72000/382000 [03:53<16:11, 319.16 examples/s] +Running tokenizer on dataset: 19%|█▊ | 71000/382000 [03:53<16:50, 307.76 examples/s] +Running tokenizer on dataset: 19%|█▉ | 74000/382000 [03:55<16:07, 318.42 examples/s] +Running tokenizer on dataset: 19%|█▉ | 74000/382000 [03:55<16:14, 316.22 examples/s] +Running tokenizer on dataset: 20%|█▉ | 75000/382000 [03:55<16:22, 312.56 examples/s] +Running tokenizer on dataset: 20%|█▉ | 75000/382000 [03:56<15:56, 320.89 examples/s] +Running tokenizer on dataset: 19%|█▉ | 72000/382000 [03:56<16:32, 312.21 examples/s] +Running tokenizer on dataset: 19%|█▉ | 73000/382000 [03:56<15:58, 322.41 examples/s] +Running tokenizer on dataset: 19%|█▉ | 72000/382000 [03:56<16:52, 306.18 examples/s] +Running tokenizer on dataset: 20%|█▉ | 75000/382000 [03:58<16:02, 319.11 examples/s] +Running tokenizer on dataset: 20%|█▉ | 75000/382000 [03:58<16:23, 312.24 examples/s] +Running tokenizer on dataset: 20%|█▉ | 76000/382000 [03:59<16:26, 310.12 examples/s] +Running tokenizer on dataset: 20%|█▉ | 76000/382000 [03:59<16:07, 316.27 examples/s] +Running tokenizer on dataset: 19%|█▉ | 73000/382000 [03:59<16:32, 311.45 examples/s] +Running tokenizer on dataset: 19%|█▉ | 74000/382000 [03:59<15:54, 322.65 examples/s] +Running tokenizer on dataset: 19%|█▉ | 73000/382000 [04:00<16:46, 307.12 examples/s] +Running tokenizer on dataset: 20%|█▉ | 76000/382000 [04:01<16:08, 315.87 examples/s] +Running tokenizer on dataset: 20%|█▉ | 76000/382000 [04:01<16:23, 311.06 examples/s] +Running tokenizer on dataset: 20%|██ | 77000/382000 [04:02<16:25, 309.61 examples/s] +Running tokenizer on dataset: 20%|██ | 77000/382000 [04:02<16:11, 313.87 examples/s] +Running tokenizer on dataset: 20%|█▉ | 75000/382000 [04:02<15:52, 322.28 examples/s] +Running tokenizer on dataset: 19%|█▉ | 74000/382000 [04:02<16:33, 310.04 examples/s] +Running tokenizer on dataset: 19%|█▉ | 74000/382000 [04:03<16:43, 307.05 examples/s] +Running tokenizer on dataset: 20%|██ | 77000/382000 [04:04<16:10, 314.22 examples/s] +Running tokenizer on dataset: 20%|██ | 77000/382000 [04:04<16:23, 309.97 examples/s] +Running tokenizer on dataset: 20%|██ | 78000/382000 [04:05<16:12, 312.58 examples/s] +Running tokenizer on dataset: 20%|██ | 78000/382000 [04:05<16:05, 314.86 examples/s] +Running tokenizer on dataset: 20%|█▉ | 76000/382000 [04:05<16:00, 318.61 examples/s] +Running tokenizer on dataset: 20%|█▉ | 75000/382000 [04:05<16:23, 312.09 examples/s] +Running tokenizer on dataset: 20%|█▉ | 75000/382000 [04:06<16:54, 302.65 examples/s] +Running tokenizer on dataset: 20%|██ | 78000/382000 [04:07<16:01, 316.01 examples/s] +Running tokenizer on dataset: 20%|██ | 78000/382000 [04:08<16:11, 
312.76 examples/s] +Running tokenizer on dataset: 21%|██ | 79000/382000 [04:08<16:07, 313.19 examples/s] +Running tokenizer on dataset: 21%|██ | 79000/382000 [04:08<15:56, 316.88 examples/s] +Running tokenizer on dataset: 20%|██ | 77000/382000 [04:09<16:03, 316.64 examples/s] +Running tokenizer on dataset: 20%|█▉ | 76000/382000 [04:09<16:33, 308.09 examples/s] +Running tokenizer on dataset: 20%|█▉ | 76000/382000 [04:10<16:55, 301.28 examples/s] +Running tokenizer on dataset: 21%|██ | 79000/382000 [04:10<15:54, 317.49 examples/s] +Running tokenizer on dataset: 21%|██ | 79000/382000 [04:11<16:03, 314.59 examples/s] +Running tokenizer on dataset: 21%|██ | 80000/382000 [04:11<16:04, 313.20 examples/s] +Running tokenizer on dataset: 21%|██ | 80000/382000 [04:12<16:01, 314.24 examples/s] +Running tokenizer on dataset: 20%|██ | 78000/382000 [04:12<16:02, 315.75 examples/s] +Running tokenizer on dataset: 20%|██ | 77000/382000 [04:12<16:22, 310.32 examples/s] +Running tokenizer on dataset: 20%|██ | 77000/382000 [04:13<17:15, 294.55 examples/s] +Running tokenizer on dataset: 21%|██ | 80000/382000 [04:14<15:58, 315.02 examples/s] +Running tokenizer on dataset: 21%|██ | 80000/382000 [04:14<16:08, 311.97 examples/s] +Running tokenizer on dataset: 21%|██ | 81000/382000 [04:15<16:00, 313.45 examples/s] +Running tokenizer on dataset: 21%|██ | 81000/382000 [04:15<16:00, 313.29 examples/s] +Running tokenizer on dataset: 21%|██ | 79000/382000 [04:15<15:53, 317.83 examples/s] +Running tokenizer on dataset: 20%|██ | 78000/382000 [04:15<16:12, 312.58 examples/s] +Running tokenizer on dataset: 20%|██ | 78000/382000 [04:17<17:13, 294.22 examples/s] +Running tokenizer on dataset: 21%|██ | 81000/382000 [04:17<15:54, 315.24 examples/s] +Running tokenizer on dataset: 21%|██ | 81000/382000 [04:17<15:59, 313.82 examples/s] +Running tokenizer on dataset: 21%|██▏ | 82000/382000 [04:18<16:03, 311.37 examples/s] +Running tokenizer on dataset: 21%|██▏ | 82000/382000 [04:18<15:56, 313.68 examples/s] +Running tokenizer on dataset: 21%|██ | 80000/382000 [04:18<15:58, 314.91 examples/s] +Running tokenizer on dataset: 21%|██ | 79000/382000 [04:18<16:01, 315.00 examples/s] +Running tokenizer on dataset: 21%|██ | 79000/382000 [04:20<16:55, 298.28 examples/s] +Running tokenizer on dataset: 21%|██▏ | 82000/382000 [04:20<15:55, 314.11 examples/s] +Running tokenizer on dataset: 21%|██▏ | 82000/382000 [04:20<15:56, 313.51 examples/s] +Running tokenizer on dataset: 22%|██▏ | 83000/382000 [04:21<15:44, 316.70 examples/s] +Running tokenizer on dataset: 22%|██▏ | 83000/382000 [04:21<15:51, 314.13 examples/s] +Running tokenizer on dataset: 21%|██ | 81000/382000 [04:21<16:00, 313.46 examples/s] +Running tokenizer on dataset: 21%|██ | 80000/382000 [04:21<16:12, 310.62 examples/s] +Running tokenizer on dataset: 21%|██ | 80000/382000 [04:23<16:49, 299.10 examples/s] +Running tokenizer on dataset: 22%|██▏ | 83000/382000 [04:23<15:45, 316.18 examples/s] +Running tokenizer on dataset: 22%|██▏ | 83000/382000 [04:23<15:43, 317.05 examples/s] +Running tokenizer on dataset: 22%|██▏ | 84000/382000 [04:24<15:34, 318.96 examples/s] +Running tokenizer on dataset: 22%|██▏ | 84000/382000 [04:24<15:46, 314.79 examples/s] +Running tokenizer on dataset: 21%|██▏ | 82000/382000 [04:24<15:50, 315.60 examples/s] +Running tokenizer on dataset: 21%|██ | 81000/382000 [04:25<16:16, 308.14 examples/s] +Running tokenizer on dataset: 22%|██▏ | 84000/382000 [04:26<15:37, 317.98 examples/s] +Running tokenizer on dataset: 21%|██ | 81000/382000 [04:26<16:42, 300.11 
examples/s] +Running tokenizer on dataset: 22%|██▏ | 84000/382000 [04:27<15:35, 318.50 examples/s] +Running tokenizer on dataset: 22%|██▏ | 85000/382000 [04:27<15:31, 318.81 examples/s] +Running tokenizer on dataset: 22%|██▏ | 85000/382000 [04:27<15:41, 315.49 examples/s] +Running tokenizer on dataset: 22%|██▏ | 83000/382000 [04:28<15:41, 317.41 examples/s] +Running tokenizer on dataset: 21%|██▏ | 82000/382000 [04:28<16:07, 309.95 examples/s] +Running tokenizer on dataset: 22%|██▏ | 85000/382000 [04:29<15:34, 317.92 examples/s] +Running tokenizer on dataset: 22%|██▏ | 85000/382000 [04:30<15:34, 317.81 examples/s] +Running tokenizer on dataset: 21%|██▏ | 82000/382000 [04:30<16:36, 301.15 examples/s] +Running tokenizer on dataset: 23%|██▎ | 86000/382000 [04:30<15:30, 317.97 examples/s] +Running tokenizer on dataset: 22%|██▏ | 84000/382000 [04:31<15:40, 316.94 examples/s] +Running tokenizer on dataset: 23%|██▎ | 86000/382000 [04:31<15:57, 309.04 examples/s] +Running tokenizer on dataset: 22%|██▏ | 83000/382000 [04:31<15:57, 312.31 examples/s] +Running tokenizer on dataset: 23%|██▎ | 86000/382000 [04:33<15:36, 316.19 examples/s] +Running tokenizer on dataset: 22%|██▏ | 83000/382000 [04:33<16:17, 305.79 examples/s] +Running tokenizer on dataset: 23%|██▎ | 86000/382000 [04:33<15:34, 316.78 examples/s] +Running tokenizer on dataset: 23%|██▎ | 87000/382000 [04:34<15:28, 317.75 examples/s] +Running tokenizer on dataset: 22%|██▏ | 85000/382000 [04:34<15:37, 316.85 examples/s] +Running tokenizer on dataset: 23%|██▎ | 87000/382000 [04:34<15:51, 309.98 examples/s] +Running tokenizer on dataset: 22%|██▏ | 84000/382000 [04:34<16:06, 308.43 examples/s] +Running tokenizer on dataset: 23%|██▎ | 87000/382000 [04:36<15:45, 312.07 examples/s] +Running tokenizer on dataset: 23%|██▎ | 87000/382000 [04:36<15:27, 318.16 examples/s] +Running tokenizer on dataset: 22%|██▏ | 84000/382000 [04:36<16:11, 306.90 examples/s] +Running tokenizer on dataset: 23%|██▎ | 88000/382000 [04:37<15:12, 322.15 examples/s] +Running tokenizer on dataset: 23%|██▎ | 88000/382000 [04:37<15:31, 315.53 examples/s] +Running tokenizer on dataset: 23%|██▎ | 86000/382000 [04:37<15:35, 316.36 examples/s] +Running tokenizer on dataset: 22%|██▏ | 85000/382000 [04:38<16:15, 304.44 examples/s] +Running tokenizer on dataset: 23%|██▎ | 88000/382000 [04:39<15:31, 315.69 examples/s] +Running tokenizer on dataset: 23%|██▎ | 88000/382000 [04:39<15:20, 319.47 examples/s] +Running tokenizer on dataset: 22%|██▏ | 85000/382000 [04:39<15:52, 311.82 examples/s] +Running tokenizer on dataset: 23%|██▎ | 89000/382000 [04:40<15:11, 321.54 examples/s] +Running tokenizer on dataset: 23%|██▎ | 87000/382000 [04:40<15:34, 315.79 examples/s] +Running tokenizer on dataset: 23%|██▎ | 89000/382000 [04:40<15:40, 311.64 examples/s] +Running tokenizer on dataset: 23%|██▎ | 86000/382000 [04:41<16:18, 302.55 examples/s] +Running tokenizer on dataset: 23%|██▎ | 89000/382000 [04:42<15:31, 314.54 examples/s] +Running tokenizer on dataset: 23%|██▎ | 89000/382000 [04:42<15:20, 318.45 examples/s] +Running tokenizer on dataset: 23%|██▎ | 86000/382000 [04:42<15:51, 310.93 examples/s] +Running tokenizer on dataset: 24%|██▎ | 90000/382000 [04:43<15:15, 318.87 examples/s] +Running tokenizer on dataset: 23%|██▎ | 88000/382000 [04:43<15:28, 316.53 examples/s] +Running tokenizer on dataset: 24%|██▎ | 90000/382000 [04:44<15:38, 311.27 examples/s] +Running tokenizer on dataset: 23%|██▎ | 87000/382000 [04:44<16:16, 302.13 examples/s] +Running tokenizer on dataset: 24%|██▎ | 90000/382000 
[04:45<15:29, 314.09 examples/s] +Running tokenizer on dataset: 24%|██▎ | 90000/382000 [04:46<15:28, 314.57 examples/s] +Running tokenizer on dataset: 23%|██▎ | 87000/382000 [04:46<15:47, 311.22 examples/s] +Running tokenizer on dataset: 24%|██▍ | 91000/382000 [04:46<15:09, 319.92 examples/s] +Running tokenizer on dataset: 23%|██▎ | 89000/382000 [04:46<15:23, 317.37 examples/s] +Running tokenizer on dataset: 24%|██▍ | 91000/382000 [04:47<15:51, 305.87 examples/s] +Running tokenizer on dataset: 23%|██▎ | 88000/382000 [04:47<15:45, 310.85 examples/s] +Running tokenizer on dataset: 24%|██▍ | 91000/382000 [04:49<15:21, 315.65 examples/s] +Running tokenizer on dataset: 24%|██▍ | 91000/382000 [04:49<15:19, 316.56 examples/s] +Running tokenizer on dataset: 23%|██▎ | 88000/382000 [04:49<15:35, 314.21 examples/s] +Running tokenizer on dataset: 24%|██▍ | 92000/382000 [04:49<15:10, 318.61 examples/s] +Running tokenizer on dataset: 24%|██▎ | 90000/382000 [04:50<15:28, 314.60 examples/s] +Running tokenizer on dataset: 24%|██▍ | 92000/382000 [04:50<15:49, 305.57 examples/s] +Running tokenizer on dataset: 23%|██▎ | 89000/382000 [04:51<15:47, 309.32 examples/s] +Running tokenizer on dataset: 24%|██▍ | 92000/382000 [04:52<15:19, 315.38 examples/s] +Running tokenizer on dataset: 24%|██▍ | 92000/382000 [04:52<15:18, 315.57 examples/s] +Running tokenizer on dataset: 23%|██▎ | 89000/382000 [04:52<15:47, 309.32 examples/s] +Running tokenizer on dataset: 24%|██▍ | 93000/382000 [04:52<15:04, 319.61 examples/s] +Running tokenizer on dataset: 24%|██▍ | 91000/382000 [04:53<15:18, 316.78 examples/s] +Running tokenizer on dataset: 24%|██▍ | 93000/382000 [04:53<15:40, 307.31 examples/s] +Running tokenizer on dataset: 24%|██▎ | 90000/382000 [04:54<15:56, 305.24 examples/s] +Running tokenizer on dataset: 24%|██▍ | 93000/382000 [04:55<15:09, 317.70 examples/s] +Running tokenizer on dataset: 24%|██▍ | 93000/382000 [04:55<15:10, 317.49 examples/s] +Running tokenizer on dataset: 24%|██▎ | 90000/382000 [04:55<15:42, 309.70 examples/s] +Running tokenizer on dataset: 25%|██▍ | 94000/382000 [04:55<15:03, 318.78 examples/s] +Running tokenizer on dataset: 24%|██▍ | 92000/382000 [04:56<15:11, 318.16 examples/s] +Running tokenizer on dataset: 25%|██▍ | 94000/382000 [04:57<15:39, 306.52 examples/s] +Running tokenizer on dataset: 24%|██▍ | 91000/382000 [04:57<15:44, 308.03 examples/s] +Running tokenizer on dataset: 25%|██▍ | 94000/382000 [04:58<15:13, 315.27 examples/s] +Running tokenizer on dataset: 25%|██▍ | 94000/382000 [04:58<15:10, 316.24 examples/s] +Running tokenizer on dataset: 24%|██▍ | 91000/382000 [04:59<15:38, 310.13 examples/s] +Running tokenizer on dataset: 25%|██▍ | 95000/382000 [04:59<15:05, 316.87 examples/s] +Running tokenizer on dataset: 24%|██▍ | 93000/382000 [04:59<15:09, 317.80 examples/s] +Running tokenizer on dataset: 25%|██▍ | 95000/382000 [05:00<15:32, 307.87 examples/s] +Running tokenizer on dataset: 24%|██▍ | 92000/382000 [05:01<15:53, 304.04 examples/s] +Running tokenizer on dataset: 25%|██▍ | 95000/382000 [05:01<15:11, 314.77 examples/s] +Running tokenizer on dataset: 25%|██▍ | 95000/382000 [05:01<15:07, 316.23 examples/s] +Running tokenizer on dataset: 24%|██▍ | 92000/382000 [05:02<15:43, 307.38 examples/s] +Running tokenizer on dataset: 25%|██▌ | 96000/382000 [05:02<15:08, 314.66 examples/s] +Running tokenizer on dataset: 25%|██▍ | 94000/382000 [05:02<15:10, 316.25 examples/s] +Running tokenizer on dataset: 25%|██▌ | 96000/382000 [05:03<15:32, 306.79 examples/s] +Running tokenizer on dataset: 24%|██▍ | 
93000/382000 [05:04<15:46, 305.20 examples/s] +Running tokenizer on dataset: 25%|██▌ | 96000/382000 [05:05<15:15, 312.23 examples/s] +Running tokenizer on dataset: 25%|██▌ | 96000/382000 [05:05<15:14, 312.87 examples/s] +Running tokenizer on dataset: 25%|██▌ | 97000/382000 [05:05<15:06, 314.40 examples/s] +Running tokenizer on dataset: 24%|██▍ | 93000/382000 [05:05<15:41, 306.90 examples/s] +Running tokenizer on dataset: 25%|██▍ | 95000/382000 [05:05<15:05, 316.84 examples/s] +Running tokenizer on dataset: 25%|██▌ | 97000/382000 [05:06<15:20, 309.61 examples/s] +Running tokenizer on dataset: 25%|██▍ | 94000/382000 [05:07<15:47, 303.84 examples/s] +Running tokenizer on dataset: 25%|██▌ | 97000/382000 [05:08<15:16, 311.08 examples/s] +Running tokenizer on dataset: 25%|██▌ | 97000/382000 [05:08<15:10, 313.10 examples/s] +Running tokenizer on dataset: 26%|██▌ | 98000/382000 [05:08<14:58, 316.15 examples/s] +Running tokenizer on dataset: 25%|██▍ | 94000/382000 [05:08<15:45, 304.53 examples/s] +Running tokenizer on dataset: 25%|██▌ | 96000/382000 [05:09<15:05, 315.71 examples/s] +Running tokenizer on dataset: 26%|██▌ | 98000/382000 [05:10<15:10, 311.86 examples/s] +Running tokenizer on dataset: 25%|██▍ | 95000/382000 [05:11<15:43, 304.08 examples/s] +Running tokenizer on dataset: 26%|██▌ | 98000/382000 [05:11<15:01, 314.88 examples/s] +Running tokenizer on dataset: 26%|██▌ | 98000/382000 [05:11<15:15, 310.18 examples/s] +Running tokenizer on dataset: 26%|██▌ | 99000/382000 [05:11<14:52, 317.00 examples/s] +Running tokenizer on dataset: 25%|██▌ | 97000/382000 [05:12<15:04, 314.95 examples/s] +Running tokenizer on dataset: 25%|██▍ | 95000/382000 [05:12<15:53, 300.90 examples/s] +Running tokenizer on dataset: 26%|██▌ | 99000/382000 [05:13<15:04, 312.75 examples/s] +Running tokenizer on dataset: 25%|██▌ | 96000/382000 [05:14<15:45, 302.58 examples/s] +Running tokenizer on dataset: 26%|██▌ | 99000/382000 [05:14<14:54, 316.33 examples/s] +Running tokenizer on dataset: 26%|██▌ | 99000/382000 [05:14<15:17, 308.42 examples/s] +Running tokenizer on dataset: 26%|██▌ | 100000/382000 [05:14<14:45, 318.50 examples/s] +Running tokenizer on dataset: 26%|██▌ | 98000/382000 [05:15<14:56, 316.68 examples/s] +Running tokenizer on dataset: 25%|██▌ | 96000/382000 [05:15<15:56, 299.10 examples/s] +Running tokenizer on dataset: 26%|██▌ | 100000/382000 [05:16<15:01, 312.75 examples/s] +Running tokenizer on dataset: 26%|██▌ | 100000/382000 [05:17<14:48, 317.52 examples/s] +Running tokenizer on dataset: 25%|██▌ | 97000/382000 [05:17<15:40, 302.91 examples/s] +Running tokenizer on dataset: 26%|██▌ | 100000/382000 [05:17<15:04, 311.70 examples/s] +Running tokenizer on dataset: 26%|██▋ | 101000/382000 [05:18<14:37, 320.38 examples/s] +Running tokenizer on dataset: 26%|██▌ | 99000/382000 [05:18<14:50, 317.73 examples/s] +Running tokenizer on dataset: 25%|██▌ | 97000/382000 [05:19<15:53, 299.02 examples/s] +Running tokenizer on dataset: 26%|██▋ | 101000/382000 [05:19<14:55, 313.62 examples/s] +Running tokenizer on dataset: 26%|██▋ | 101000/382000 [05:20<14:45, 317.50 examples/s] +Running tokenizer on dataset: 26%|██▌ | 98000/382000 [05:20<15:37, 302.88 examples/s] +Running tokenizer on dataset: 27%|██▋ | 102000/382000 [05:21<14:43, 316.99 examples/s] +Running tokenizer on dataset: 26%|██▋ | 101000/382000 [05:21<15:13, 307.58 examples/s] +Running tokenizer on dataset: 26%|██▌ | 100000/382000 [05:21<14:49, 317.19 examples/s] +Running tokenizer on dataset: 26%|██▌ | 98000/382000 [05:22<15:45, 300.29 examples/s] +Running tokenizer 
on dataset: 27%|██▋ | 102000/382000 [05:22<14:57, 311.91 examples/s] +Running tokenizer on dataset: 27%|██▋ | 102000/382000 [05:23<14:46, 315.90 examples/s] +Running tokenizer on dataset: 26%|██▌ | 99000/382000 [05:24<15:21, 307.14 examples/s] +Running tokenizer on dataset: 27%|██▋ | 103000/382000 [05:24<14:47, 314.22 examples/s] +Running tokenizer on dataset: 27%|██▋ | 102000/382000 [05:24<15:10, 307.51 examples/s] +Running tokenizer on dataset: 26%|██▋ | 101000/382000 [05:24<14:38, 319.85 examples/s] +Running tokenizer on dataset: 26%|██▌ | 99000/382000 [05:25<15:35, 302.39 examples/s] +Running tokenizer on dataset: 27%|██▋ | 103000/382000 [05:26<14:59, 310.13 examples/s] +Running tokenizer on dataset: 27%|██▋ | 103000/382000 [05:27<14:54, 312.06 examples/s] +Running tokenizer on dataset: 26%|██▌ | 100000/382000 [05:27<15:13, 308.56 examples/s] +Running tokenizer on dataset: 27%|██▋ | 104000/382000 [05:27<14:38, 316.53 examples/s] +Running tokenizer on dataset: 27%|██▋ | 103000/382000 [05:27<15:11, 305.97 examples/s] +Running tokenizer on dataset: 27%|██▋ | 102000/382000 [05:27<14:42, 317.38 examples/s] +Running tokenizer on dataset: 26%|██▌ | 100000/382000 [05:28<15:24, 304.97 examples/s] +Running tokenizer on dataset: 27%|██▋ | 104000/382000 [05:29<14:47, 313.39 examples/s] +Running tokenizer on dataset: 27%|██▋ | 104000/382000 [05:30<14:49, 312.42 examples/s] +Running tokenizer on dataset: 26%|██▋ | 101000/382000 [05:30<15:14, 307.20 examples/s] +Running tokenizer on dataset: 27%|██▋ | 105000/382000 [05:30<14:28, 319.04 examples/s] +Running tokenizer on dataset: 27%|██▋ | 104000/382000 [05:31<15:06, 306.64 examples/s] +Running tokenizer on dataset: 27%|██▋ | 103000/382000 [05:31<14:47, 314.38 examples/s] +Running tokenizer on dataset: 26%|██▋ | 101000/382000 [05:32<15:08, 309.20 examples/s] +Running tokenizer on dataset: 27%|██▋ | 105000/382000 [05:32<14:36, 316.10 examples/s] +Running tokenizer on dataset: 27%|██▋ | 105000/382000 [05:33<14:41, 314.26 examples/s] +Running tokenizer on dataset: 28%|██▊ | 106000/382000 [05:33<14:19, 321.10 examples/s] +Running tokenizer on dataset: 27%|██▋ | 102000/382000 [05:33<15:13, 306.50 examples/s] +Running tokenizer on dataset: 27%|██▋ | 105000/382000 [05:34<14:57, 308.70 examples/s] +Running tokenizer on dataset: 27%|██▋ | 104000/382000 [05:34<14:43, 314.71 examples/s] +Running tokenizer on dataset: 27%|██▋ | 102000/382000 [05:35<15:12, 306.68 examples/s] +Running tokenizer on dataset: 28%|██▊ | 106000/382000 [05:35<14:30, 317.05 examples/s] +Running tokenizer on dataset: 28%|██▊ | 106000/382000 [05:36<14:28, 317.67 examples/s] +Running tokenizer on dataset: 27%|██▋ | 103000/382000 [05:37<15:08, 307.14 examples/s] +Running tokenizer on dataset: 28%|██▊ | 107000/382000 [05:37<14:44, 310.89 examples/s] +Running tokenizer on dataset: 28%|██▊ | 106000/382000 [05:37<14:50, 309.90 examples/s] +Running tokenizer on dataset: 27%|██▋ | 105000/382000 [05:37<14:34, 316.80 examples/s] +Running tokenizer on dataset: 27%|██▋ | 103000/382000 [05:38<15:17, 304.07 examples/s] +Running tokenizer on dataset: 28%|██▊ | 107000/382000 [05:39<15:05, 303.77 examples/s] +Running tokenizer on dataset: 28%|██▊ | 107000/382000 [05:40<14:52, 308.25 examples/s] +Running tokenizer on dataset: 28%|██▊ | 108000/382000 [05:40<14:29, 315.18 examples/s] +Running tokenizer on dataset: 27%|██▋ | 104000/382000 [05:40<14:55, 310.42 examples/s] +Running tokenizer on dataset: 28%|██▊ | 106000/382000 [05:40<14:25, 318.73 examples/s] +Running tokenizer on dataset: 28%|██▊ | 107000/382000 
[05:40<15:07, 302.99 examples/s] +Running tokenizer on dataset: 27%|██▋ | 104000/382000 [05:41<15:13, 304.23 examples/s] +Running tokenizer on dataset: 28%|██▊ | 108000/382000 [05:42<14:50, 307.85 examples/s] +Running tokenizer on dataset: 28%|██▊ | 108000/382000 [05:43<14:47, 308.81 examples/s] +Running tokenizer on dataset: 27%|██▋ | 105000/382000 [05:43<14:50, 310.94 examples/s] +Running tokenizer on dataset: 29%|██▊ | 109000/382000 [05:43<14:31, 313.12 examples/s] +Running tokenizer on dataset: 28%|██▊ | 108000/382000 [05:44<14:54, 306.15 examples/s] +Running tokenizer on dataset: 28%|██▊ | 107000/382000 [05:44<14:53, 307.81 examples/s] +Running tokenizer on dataset: 27%|██▋ | 105000/382000 [05:45<15:08, 305.03 examples/s] +Running tokenizer on dataset: 29%|██▊ | 109000/382000 [05:45<14:42, 309.30 examples/s] +Running tokenizer on dataset: 29%|██▉ | 110000/382000 [05:46<14:17, 317.18 examples/s] +Running tokenizer on dataset: 28%|██▊ | 106000/382000 [05:46<14:39, 313.78 examples/s] +Running tokenizer on dataset: 29%|██▊ | 109000/382000 [05:46<14:45, 308.31 examples/s] +Running tokenizer on dataset: 28%|██▊ | 108000/382000 [05:47<14:37, 312.38 examples/s] +Running tokenizer on dataset: 29%|██▊ | 109000/382000 [05:47<14:46, 307.90 examples/s] +Running tokenizer on dataset: 29%|██▉ | 110000/382000 [05:48<14:27, 313.62 examples/s] +Running tokenizer on dataset: 28%|██▊ | 106000/382000 [05:48<15:04, 305.18 examples/s] +Running tokenizer on dataset: 29%|██▉ | 110000/382000 [05:49<14:27, 313.72 examples/s] +Running tokenizer on dataset: 29%|██▉ | 111000/382000 [05:49<14:26, 312.69 examples/s] +Running tokenizer on dataset: 28%|██▊ | 107000/382000 [05:50<14:56, 306.81 examples/s] +Running tokenizer on dataset: 29%|██▊ | 109000/382000 [05:50<14:28, 314.49 examples/s] +Running tokenizer on dataset: 29%|██▉ | 110000/382000 [05:50<14:32, 311.91 examples/s] +Running tokenizer on dataset: 29%|██▉ | 111000/382000 [05:51<14:39, 308.18 examples/s] +Running tokenizer on dataset: 28%|██▊ | 107000/382000 [05:52<15:25, 297.20 examples/s] +Running tokenizer on dataset: 29%|██▉ | 111000/382000 [05:53<14:33, 310.23 examples/s] +Running tokenizer on dataset: 29%|██▉ | 112000/382000 [05:53<14:19, 314.12 examples/s] +Running tokenizer on dataset: 28%|██▊ | 108000/382000 [05:53<14:45, 309.40 examples/s] +Running tokenizer on dataset: 29%|██▉ | 110000/382000 [05:53<14:18, 316.99 examples/s] +Running tokenizer on dataset: 29%|██▉ | 111000/382000 [05:53<14:33, 310.14 examples/s] +Running tokenizer on dataset: 29%|██▉ | 112000/382000 [05:55<14:41, 306.14 examples/s] +Running tokenizer on dataset: 28%|██▊ | 108000/382000 [05:55<15:07, 301.97 examples/s] +Running tokenizer on dataset: 29%|██▉ | 112000/382000 [05:56<14:24, 312.18 examples/s] +Running tokenizer on dataset: 30%|██▉ | 113000/382000 [05:56<14:15, 314.33 examples/s] +Running tokenizer on dataset: 29%|██▊ | 109000/382000 [05:56<14:34, 312.01 examples/s] +Running tokenizer on dataset: 29%|██▉ | 111000/382000 [05:56<14:28, 311.96 examples/s] +Running tokenizer on dataset: 29%|██▉ | 112000/382000 [05:56<14:27, 311.17 examples/s] +Running tokenizer on dataset: 29%|██▊ | 109000/382000 [05:58<14:50, 306.52 examples/s] +Running tokenizer on dataset: 30%|██▉ | 113000/382000 [05:58<14:54, 300.59 examples/s] +Running tokenizer on dataset: 30%|██▉ | 113000/382000 [05:59<14:17, 313.72 examples/s] +Running tokenizer on dataset: 30%|██▉ | 114000/382000 [05:59<14:12, 314.33 examples/s] +Running tokenizer on dataset: 29%|██▉ | 110000/382000 [05:59<14:24, 314.70 examples/s] 
+Running tokenizer on dataset: 29%|██▉ | 110000/382000 [06:01<14:39, 309.11 examples/s]
+[interleaved tqdm progress updates from the parallel tokenizer ranks elided; every rank advances steadily from ~29% (110000/382000) at 06:01 to ~61% (233000/382000) at 12:15, at roughly 295-330 examples/s throughout]
+Running tokenizer on dataset: 61%|██████ | 233000/382000 [12:15<07:51, 315.74 examples/s]
+Running tokenizer on dataset: 60%|█████▉ | 
229000/382000 [12:17<07:50, 325.04 examples/s] +Running tokenizer on dataset: 60%|█████▉ | 229000/382000 [12:17<07:53, 323.21 examples/s] +Running tokenizer on dataset: 61%|██████ | 232000/382000 [12:17<07:48, 320.01 examples/s] +Running tokenizer on dataset: 60%|█████▉ | 228000/382000 [12:17<07:55, 324.19 examples/s] +Running tokenizer on dataset: 61%|██████▏ | 234000/382000 [12:18<07:48, 315.76 examples/s] +Running tokenizer on dataset: 61%|██████ | 233000/382000 [12:19<07:55, 313.44 examples/s] +Running tokenizer on dataset: 61%|██████ | 233000/382000 [12:19<07:52, 315.39 examples/s] +Running tokenizer on dataset: 60%|██████ | 230000/382000 [12:20<07:49, 323.44 examples/s] +Running tokenizer on dataset: 60%|██████ | 230000/382000 [12:20<07:50, 323.08 examples/s] +Running tokenizer on dataset: 61%|██████ | 233000/382000 [12:20<07:47, 318.86 examples/s] +Running tokenizer on dataset: 60%|█████▉ | 229000/382000 [12:21<07:51, 324.59 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 235000/382000 [12:21<07:38, 320.81 examples/s] +Running tokenizer on dataset: 61%|██████▏ | 234000/382000 [12:22<07:55, 310.97 examples/s] +Running tokenizer on dataset: 61%|██████▏ | 234000/382000 [12:22<07:50, 314.40 examples/s] +Running tokenizer on dataset: 60%|██████ | 231000/382000 [12:23<07:49, 321.91 examples/s] +Running tokenizer on dataset: 60%|██████ | 231000/382000 [12:23<07:50, 320.91 examples/s] +Running tokenizer on dataset: 61%|██████▏ | 234000/382000 [12:23<07:45, 317.95 examples/s] +Running tokenizer on dataset: 60%|██████ | 230000/382000 [12:24<07:50, 322.97 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 236000/382000 [12:24<07:33, 322.14 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 235000/382000 [12:25<07:47, 314.65 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 235000/382000 [12:25<07:43, 317.46 examples/s] +Running tokenizer on dataset: 61%|██████ | 232000/382000 [12:26<07:47, 320.91 examples/s] +Running tokenizer on dataset: 61%|██████ | 232000/382000 [12:26<07:48, 320.04 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 235000/382000 [12:26<07:36, 322.14 examples/s] +Running tokenizer on dataset: 60%|██████ | 231000/382000 [12:27<07:49, 321.90 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 237000/382000 [12:27<07:25, 325.36 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 236000/382000 [12:28<07:39, 317.44 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 236000/382000 [12:29<07:39, 317.76 examples/s] +Running tokenizer on dataset: 61%|██████ | 233000/382000 [12:29<07:45, 320.43 examples/s] +Running tokenizer on dataset: 61%|██████ | 233000/382000 [12:29<07:47, 319.02 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 236000/382000 [12:29<07:31, 323.17 examples/s] +Running tokenizer on dataset: 61%|██████ | 232000/382000 [12:30<07:47, 320.80 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 238000/382000 [12:30<07:25, 323.36 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 237000/382000 [12:31<07:36, 317.29 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 237000/382000 [12:32<07:33, 319.43 examples/s] +Running tokenizer on dataset: 61%|██████▏ | 234000/382000 [12:32<07:42, 320.13 examples/s] +Running tokenizer on dataset: 61%|██████▏ | 234000/382000 [12:32<07:43, 319.36 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 237000/382000 [12:32<07:27, 324.21 examples/s] +Running tokenizer on dataset: 61%|██████ | 233000/382000 [12:33<07:46, 319.60 examples/s] +Running tokenizer on dataset: 
63%|██████▎ | 239000/382000 [12:33<07:18, 325.82 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 238000/382000 [12:35<07:34, 316.49 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 238000/382000 [12:35<07:33, 317.52 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 235000/382000 [12:35<07:33, 324.44 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 235000/382000 [12:35<07:33, 324.38 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 238000/382000 [12:36<07:27, 321.77 examples/s] +Running tokenizer on dataset: 61%|██████▏ | 234000/382000 [12:36<07:42, 319.85 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 240000/382000 [12:36<07:12, 328.67 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 239000/382000 [12:38<07:28, 318.62 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 239000/382000 [12:38<07:28, 319.16 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 236000/382000 [12:38<07:29, 324.62 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 236000/382000 [12:38<07:28, 325.55 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 239000/382000 [12:39<07:20, 324.44 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 235000/382000 [12:39<07:32, 324.83 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 241000/382000 [12:39<07:07, 329.98 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 240000/382000 [12:41<07:22, 320.84 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 240000/382000 [12:41<07:21, 321.45 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 237000/382000 [12:41<07:25, 325.30 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 237000/382000 [12:41<07:23, 327.10 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 240000/382000 [12:42<07:13, 327.55 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 236000/382000 [12:42<07:27, 326.02 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 242000/382000 [12:42<07:04, 329.67 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 241000/382000 [12:44<07:17, 321.98 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 241000/382000 [12:44<07:18, 321.80 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 238000/382000 [12:44<07:25, 323.28 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 238000/382000 [12:45<07:25, 323.19 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 241000/382000 [12:45<07:10, 327.64 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 237000/382000 [12:45<07:23, 326.61 examples/s] +Running tokenizer on dataset: 64%|██████▎ | 243000/382000 [12:45<07:00, 330.52 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 242000/382000 [12:47<07:13, 322.89 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 242000/382000 [12:47<07:15, 321.34 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 239000/382000 [12:47<07:17, 326.76 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 239000/382000 [12:48<07:18, 325.87 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 242000/382000 [12:48<07:07, 327.19 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 244000/382000 [12:48<07:00, 328.02 examples/s] +Running tokenizer on dataset: 62%|██████▏ | 238000/382000 [12:48<07:25, 323.39 examples/s] +Running tokenizer on dataset: 64%|██████▎ | 243000/382000 [12:50<07:11, 322.49 examples/s] +Running tokenizer on dataset: 64%|██████▎ | 243000/382000 [12:50<07:11, 322.48 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 240000/382000 [12:50<07:11, 329.08 
examples/s] +Running tokenizer on dataset: 63%|██████▎ | 240000/382000 [12:51<07:11, 328.76 examples/s] +Running tokenizer on dataset: 64%|██████▎ | 243000/382000 [12:51<07:04, 327.53 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 245000/382000 [12:51<06:57, 328.07 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 239000/382000 [12:52<07:20, 324.35 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 244000/382000 [12:53<07:13, 318.17 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 244000/382000 [12:53<07:10, 320.50 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 241000/382000 [12:53<07:08, 329.24 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 241000/382000 [12:54<07:07, 329.64 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 244000/382000 [12:54<07:03, 325.66 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 240000/382000 [12:54<07:12, 327.98 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 246000/382000 [12:55<06:57, 325.50 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 245000/382000 [12:56<07:08, 319.87 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 245000/382000 [12:57<07:06, 321.21 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 242000/382000 [12:57<07:07, 327.59 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 242000/382000 [12:57<07:06, 328.39 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 245000/382000 [12:57<06:59, 326.68 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 241000/382000 [12:57<07:07, 329.61 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 247000/382000 [12:58<06:53, 326.49 examples/s] +Running tokenizer on dataset: 64%|██████▎ | 243000/382000 [13:00<07:03, 328.23 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 246000/382000 [13:00<07:09, 316.47 examples/s] +Running tokenizer on dataset: 64%|██████▎ | 243000/382000 [13:00<07:04, 327.20 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 246000/382000 [13:00<07:08, 317.29 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 246000/382000 [13:00<07:01, 322.65 examples/s] +Running tokenizer on dataset: 63%|██████▎ | 242000/382000 [13:01<07:07, 327.84 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 248000/382000 [13:01<06:47, 329.04 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 244000/382000 [13:03<07:02, 326.40 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 244000/382000 [13:03<07:04, 324.78 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 247000/382000 [13:03<07:08, 314.98 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 247000/382000 [13:03<07:05, 317.32 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 247000/382000 [13:03<06:58, 322.23 examples/s] +Running tokenizer on dataset: 64%|██████▎ | 243000/382000 [13:04<07:03, 328.29 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 249000/382000 [13:04<06:47, 326.03 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 245000/382000 [13:06<06:59, 326.85 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 245000/382000 [13:06<06:59, 326.73 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 248000/382000 [13:06<06:59, 319.13 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 248000/382000 [13:06<07:04, 315.30 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 248000/382000 [13:06<06:54, 323.03 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 244000/382000 [13:07<07:05, 324.58 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 
250000/382000 [13:07<06:48, 322.88 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 246000/382000 [13:09<06:59, 324.03 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 246000/382000 [13:09<06:58, 324.94 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 249000/382000 [13:09<06:58, 317.68 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 249000/382000 [13:09<07:03, 314.26 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 249000/382000 [13:09<06:54, 321.17 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 245000/382000 [13:10<06:59, 326.82 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 251000/382000 [13:10<06:45, 322.95 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 247000/382000 [13:12<06:58, 322.85 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 247000/382000 [13:12<06:55, 324.94 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 250000/382000 [13:12<07:00, 314.10 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 250000/382000 [13:13<07:03, 311.53 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 250000/382000 [13:13<06:53, 319.06 examples/s] +Running tokenizer on dataset: 64%|██████▍ | 246000/382000 [13:13<07:00, 323.65 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 252000/382000 [13:13<06:43, 322.17 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 248000/382000 [13:15<06:54, 323.61 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 248000/382000 [13:15<06:50, 326.62 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 251000/382000 [13:16<06:57, 314.07 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 251000/382000 [13:16<07:00, 311.23 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 251000/382000 [13:16<06:50, 319.15 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 247000/382000 [13:16<06:57, 323.38 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 253000/382000 [13:16<06:36, 324.98 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 249000/382000 [13:18<06:50, 323.74 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 249000/382000 [13:18<06:48, 325.23 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 252000/382000 [13:19<06:52, 315.43 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 252000/382000 [13:19<06:47, 318.86 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 252000/382000 [13:19<06:59, 309.55 examples/s] +Running tokenizer on dataset: 65%|██████▍ | 248000/382000 [13:19<06:52, 324.94 examples/s] +Running tokenizer on dataset: 66%|██████▋ | 254000/382000 [13:19<06:34, 324.24 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 250000/382000 [13:21<06:50, 321.73 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 250000/382000 [13:21<06:50, 321.92 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 253000/382000 [13:22<06:46, 317.21 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 253000/382000 [13:22<06:43, 319.90 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 253000/382000 [13:22<06:55, 310.39 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 249000/382000 [13:22<06:52, 322.45 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 255000/382000 [13:22<06:29, 325.87 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 251000/382000 [13:24<06:45, 323.10 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 251000/382000 [13:24<06:48, 320.93 examples/s] +Running tokenizer on dataset: 66%|██████▋ | 254000/382000 [13:25<06:44, 316.31 examples/s] +Running 
tokenizer on dataset: 66%|██████▋ | 254000/382000 [13:25<06:39, 320.15 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 256000/382000 [13:25<06:25, 326.70 examples/s] +Running tokenizer on dataset: 65%|██████▌ | 250000/382000 [13:25<06:50, 321.76 examples/s] +Running tokenizer on dataset: 66%|██████▋ | 254000/382000 [13:25<06:51, 310.95 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 252000/382000 [13:28<06:41, 323.46 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 252000/382000 [13:28<06:46, 319.86 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 255000/382000 [13:28<06:33, 322.68 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 255000/382000 [13:28<06:38, 318.36 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 251000/382000 [13:28<06:46, 322.33 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 257000/382000 [13:28<06:25, 324.15 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 255000/382000 [13:29<06:47, 311.43 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 253000/382000 [13:31<06:36, 325.16 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 253000/382000 [13:31<06:42, 320.33 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 256000/382000 [13:31<06:31, 322.15 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 256000/382000 [13:31<06:37, 317.03 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 258000/382000 [13:31<06:20, 326.15 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 252000/382000 [13:32<06:43, 322.54 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 256000/382000 [13:32<06:43, 311.91 examples/s] +Running tokenizer on dataset: 66%|██████▋ | 254000/382000 [13:34<06:35, 323.74 examples/s] +Running tokenizer on dataset: 66%|██████▋ | 254000/382000 [13:34<06:39, 320.68 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 257000/382000 [13:34<06:30, 319.75 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 259000/382000 [13:35<06:17, 326.10 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 257000/382000 [13:35<06:35, 315.70 examples/s] +Running tokenizer on dataset: 66%|██████▌ | 253000/382000 [13:35<06:39, 322.65 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 257000/382000 [13:36<07:03, 294.99 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 255000/382000 [13:37<06:29, 326.30 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 255000/382000 [13:37<06:33, 323.01 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 258000/382000 [13:38<06:27, 320.34 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 260000/382000 [13:38<06:13, 326.99 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 258000/382000 [13:38<06:30, 317.93 examples/s] +Running tokenizer on dataset: 66%|██████▋ | 254000/382000 [13:38<06:37, 322.36 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 258000/382000 [13:39<06:50, 302.16 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 256000/382000 [13:40<06:24, 327.29 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 256000/382000 [13:40<06:34, 319.70 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 261000/382000 [13:41<06:09, 327.82 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 259000/382000 [13:41<06:23, 320.63 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 255000/382000 [13:41<06:31, 324.05 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 259000/382000 [13:41<06:28, 316.25 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 259000/382000 
[13:42<06:42, 305.67 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 257000/382000 [13:43<06:24, 325.00 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 257000/382000 [13:43<06:36, 315.47 examples/s] +Running tokenizer on dataset: 69%|██████▊ | 262000/382000 [13:44<06:08, 325.73 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 260000/382000 [13:44<06:19, 321.62 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 256000/382000 [13:44<06:28, 324.64 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 260000/382000 [13:44<06:25, 316.35 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 260000/382000 [13:45<06:35, 308.78 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 258000/382000 [13:46<06:19, 326.55 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 258000/382000 [13:46<06:30, 317.37 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 263000/382000 [13:47<06:06, 325.08 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 261000/382000 [13:47<06:14, 323.10 examples/s] +Running tokenizer on dataset: 67%|██████▋ | 257000/382000 [13:47<06:26, 323.26 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 261000/382000 [13:47<06:20, 318.28 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 261000/382000 [13:48<06:25, 313.67 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 259000/382000 [13:49<06:17, 325.61 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 259000/382000 [13:50<06:26, 318.01 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 264000/382000 [13:50<06:03, 324.51 examples/s] +Running tokenizer on dataset: 69%|██████▊ | 262000/382000 [13:50<06:14, 320.58 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 258000/382000 [13:50<06:21, 325.07 examples/s] +Running tokenizer on dataset: 69%|██████▊ | 262000/382000 [13:50<06:17, 317.71 examples/s] +Running tokenizer on dataset: 69%|██████▊ | 262000/382000 [13:52<06:32, 305.45 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 260000/382000 [13:52<06:15, 325.13 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 260000/382000 [13:53<06:23, 317.90 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 265000/382000 [13:53<06:03, 322.01 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 263000/382000 [13:53<06:11, 320.49 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 259000/382000 [13:53<06:19, 323.90 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 263000/382000 [13:53<06:15, 317.14 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 261000/382000 [13:55<06:09, 327.44 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 263000/382000 [13:55<06:49, 290.32 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 261000/382000 [13:56<06:18, 319.61 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 266000/382000 [13:56<06:01, 320.92 examples/s] +Running tokenizer on dataset: 68%|██████▊ | 260000/382000 [13:56<06:17, 323.22 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 264000/382000 [13:56<06:08, 320.46 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 264000/382000 [13:57<06:12, 317.04 examples/s] +Running tokenizer on dataset: 69%|██████▊ | 262000/382000 [13:58<06:07, 326.84 examples/s] +Running tokenizer on dataset: 69%|██████▊ | 262000/382000 [13:59<06:14, 320.09 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 267000/382000 [13:59<05:56, 322.77 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 264000/382000 [13:59<06:59, 281.39 examples/s] +Running tokenizer on 
dataset: 68%|██████▊ | 261000/382000 [13:59<06:12, 324.92 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 265000/382000 [13:59<06:07, 317.95 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 265000/382000 [14:00<06:13, 313.62 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 263000/382000 [14:01<06:03, 327.18 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 263000/382000 [14:02<06:14, 317.77 examples/s] +Running tokenizer on dataset: 70%|███████ | 268000/382000 [14:02<05:51, 324.07 examples/s] +Running tokenizer on dataset: 69%|██████▊ | 262000/382000 [14:02<06:09, 324.57 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 265000/382000 [14:02<06:43, 290.16 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 266000/382000 [14:03<06:05, 317.75 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 266000/382000 [14:03<06:09, 314.00 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 264000/382000 [14:04<06:02, 325.45 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 264000/382000 [14:05<06:10, 318.72 examples/s] +Running tokenizer on dataset: 70%|███████ | 269000/382000 [14:05<05:49, 323.24 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 263000/382000 [14:05<06:07, 324.04 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 266000/382000 [14:06<06:30, 297.41 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 267000/382000 [14:06<06:00, 319.04 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 267000/382000 [14:06<06:04, 315.74 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 265000/382000 [14:07<06:03, 321.87 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 265000/382000 [14:08<06:07, 318.18 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 264000/382000 [14:09<06:03, 324.56 examples/s] +Running tokenizer on dataset: 71%|███████ | 270000/382000 [14:09<05:46, 322.84 examples/s] +Running tokenizer on dataset: 70%|███████ | 268000/382000 [14:09<05:55, 321.10 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 267000/382000 [14:09<06:26, 297.84 examples/s] +Running tokenizer on dataset: 70%|███████ | 268000/382000 [14:09<05:59, 317.35 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 266000/382000 [14:11<06:00, 321.65 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 266000/382000 [14:12<06:04, 318.57 examples/s] +Running tokenizer on dataset: 71%|███████ | 271000/382000 [14:12<05:41, 324.81 examples/s] +Running tokenizer on dataset: 69%|██████▉ | 265000/382000 [14:12<06:04, 321.16 examples/s] +Running tokenizer on dataset: 70%|███████ | 269000/382000 [14:12<05:53, 320.02 examples/s] +Running tokenizer on dataset: 70%|███████ | 268000/382000 [14:12<06:17, 302.11 examples/s] +Running tokenizer on dataset: 70%|███████ | 269000/382000 [14:12<05:57, 316.02 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 267000/382000 [14:14<05:55, 323.58 examples/s] +Running tokenizer on dataset: 71%|███████ | 272000/382000 [14:15<05:38, 325.24 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 267000/382000 [14:15<06:01, 318.49 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 266000/382000 [14:15<06:01, 320.73 examples/s] +Running tokenizer on dataset: 71%|███████ | 270000/382000 [14:15<05:50, 319.29 examples/s] +Running tokenizer on dataset: 70%|███████ | 269000/382000 [14:15<06:13, 302.68 examples/s] +Running tokenizer on dataset: 71%|███████ | 270000/382000 [14:16<05:56, 314.09 examples/s] +Running tokenizer on dataset: 70%|███████ | 268000/382000 [14:17<05:49, 326.37 
examples/s] +Running tokenizer on dataset: 71%|███████▏ | 273000/382000 [14:18<05:35, 325.05 examples/s] +Running tokenizer on dataset: 70%|███████ | 268000/382000 [14:18<05:56, 320.15 examples/s] +Running tokenizer on dataset: 70%|██████▉ | 267000/382000 [14:18<05:56, 322.32 examples/s] +Running tokenizer on dataset: 71%|███████ | 271000/382000 [14:18<05:46, 320.35 examples/s] +Running tokenizer on dataset: 71%|███████ | 270000/382000 [14:19<06:07, 304.75 examples/s] +Running tokenizer on dataset: 71%|███████ | 271000/382000 [14:19<05:50, 316.46 examples/s] +Running tokenizer on dataset: 70%|███████ | 269000/382000 [14:20<05:47, 324.87 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 274000/382000 [14:21<05:30, 327.08 examples/s] +Running tokenizer on dataset: 70%|███████ | 269000/382000 [14:21<05:52, 320.56 examples/s] +Running tokenizer on dataset: 70%|███████ | 268000/382000 [14:21<05:52, 323.16 examples/s] +Running tokenizer on dataset: 71%|███████ | 272000/382000 [14:21<05:42, 321.30 examples/s] +Running tokenizer on dataset: 71%|███████ | 272000/382000 [14:22<05:46, 317.66 examples/s] +Running tokenizer on dataset: 71%|███████ | 271000/382000 [14:22<06:05, 303.62 examples/s] +Running tokenizer on dataset: 71%|███████ | 270000/382000 [14:23<05:45, 323.79 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 275000/382000 [14:24<05:24, 329.74 examples/s] +Running tokenizer on dataset: 71%|███████ | 270000/382000 [14:24<05:49, 320.04 examples/s] +Running tokenizer on dataset: 70%|███████ | 269000/382000 [14:24<05:51, 321.78 examples/s] +Running tokenizer on dataset: 71%|███████▏ | 273000/382000 [14:24<05:39, 321.23 examples/s] +Running tokenizer on dataset: 71%|███████▏ | 273000/382000 [14:25<05:43, 317.34 examples/s] +Running tokenizer on dataset: 71%|███████ | 272000/382000 [14:25<05:56, 308.44 examples/s] +Running tokenizer on dataset: 71%|███████ | 271000/382000 [14:26<05:42, 323.73 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 276000/382000 [14:27<05:24, 326.15 examples/s] +Running tokenizer on dataset: 71%|███████ | 271000/382000 [14:27<05:44, 321.99 examples/s] +Running tokenizer on dataset: 71%|███████ | 270000/382000 [14:27<05:49, 320.34 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 274000/382000 [14:27<05:33, 323.74 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 274000/382000 [14:28<05:38, 319.50 examples/s] +Running tokenizer on dataset: 71%|███████▏ | 273000/382000 [14:28<05:53, 308.45 examples/s] +Running tokenizer on dataset: 71%|███████ | 272000/382000 [14:29<05:37, 326.39 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 277000/382000 [14:30<05:21, 326.18 examples/s] +Running tokenizer on dataset: 71%|███████ | 272000/382000 [14:30<05:42, 320.94 examples/s] +Running tokenizer on dataset: 71%|███████ | 271000/382000 [14:30<05:45, 321.40 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 275000/382000 [14:30<05:28, 325.95 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 275000/382000 [14:31<05:33, 321.13 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 274000/382000 [14:31<05:44, 313.87 examples/s] +Running tokenizer on dataset: 71%|███████▏ | 273000/382000 [14:32<05:33, 326.76 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 278000/382000 [14:33<05:19, 325.58 examples/s] +Running tokenizer on dataset: 71%|███████▏ | 273000/382000 [14:33<05:39, 321.26 examples/s] +Running tokenizer on dataset: 71%|███████ | 272000/382000 [14:33<05:41, 321.88 examples/s] +Running tokenizer on dataset: 
72%|███████▏ | 276000/382000 [14:34<05:29, 321.54 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 276000/382000 [14:34<05:34, 317.24 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 275000/382000 [14:35<05:38, 316.17 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 274000/382000 [14:35<05:28, 328.46 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 279000/382000 [14:36<05:17, 323.99 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 274000/382000 [14:36<05:35, 322.05 examples/s] +Running tokenizer on dataset: 71%|███████▏ | 273000/382000 [14:37<05:37, 322.61 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 277000/382000 [14:37<05:26, 321.44 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 277000/382000 [14:38<05:30, 317.62 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 276000/382000 [14:38<05:37, 314.15 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 275000/382000 [14:38<05:24, 329.36 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 280000/382000 [14:39<05:17, 320.90 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 275000/382000 [14:39<05:31, 323.04 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 274000/382000 [14:40<05:32, 324.78 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 278000/382000 [14:40<05:23, 321.01 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 278000/382000 [14:41<05:27, 317.19 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 277000/382000 [14:41<05:32, 315.50 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 276000/382000 [14:41<05:25, 325.66 examples/s] +Running tokenizer on dataset: 74%|███████▎ | 281000/382000 [14:42<05:14, 320.77 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 275000/382000 [14:43<05:27, 326.99 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 276000/382000 [14:43<05:32, 318.57 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 279000/382000 [14:43<05:22, 319.84 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 279000/382000 [14:44<05:26, 315.21 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 278000/382000 [14:44<05:28, 316.20 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 277000/382000 [14:44<05:22, 325.14 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 282000/382000 [14:46<05:11, 320.63 examples/s] +Running tokenizer on dataset: 72%|███████▏ | 276000/382000 [14:46<05:29, 321.77 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 277000/382000 [14:46<05:30, 317.86 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 280000/382000 [14:46<05:21, 317.70 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 280000/382000 [14:47<05:25, 313.41 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 279000/382000 [14:47<05:26, 315.10 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 278000/382000 [14:47<05:18, 326.21 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 283000/382000 [14:49<05:10, 319.28 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 277000/382000 [14:49<05:26, 321.29 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 278000/382000 [14:49<05:25, 319.92 examples/s] +Running tokenizer on dataset: 74%|███████▎ | 281000/382000 [14:49<05:19, 315.77 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 279000/382000 [14:50<05:17, 324.78 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 280000/382000 [14:50<05:24, 314.41 examples/s] +Running tokenizer on dataset: 74%|███████▎ | 
281000/382000 [14:50<05:23, 312.26 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 284000/382000 [14:52<05:03, 322.81 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 278000/382000 [14:52<05:23, 321.34 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 279000/382000 [14:52<05:23, 318.27 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 282000/382000 [14:53<05:16, 316.20 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 280000/382000 [14:54<05:17, 321.76 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 282000/382000 [14:54<05:21, 311.38 examples/s] +Running tokenizer on dataset: 74%|███████▎ | 281000/382000 [14:54<05:23, 311.87 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 285000/382000 [14:55<05:02, 321.19 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 279000/382000 [14:55<05:21, 320.47 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 280000/382000 [14:55<05:24, 314.26 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 283000/382000 [14:56<05:13, 315.39 examples/s] +Running tokenizer on dataset: 74%|███████▎ | 281000/382000 [14:57<05:15, 319.85 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 282000/382000 [14:57<05:19, 313.04 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 283000/382000 [14:57<05:17, 311.56 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 286000/382000 [14:58<04:56, 324.14 examples/s] +Running tokenizer on dataset: 73%|███████▎ | 280000/382000 [14:58<05:19, 318.95 examples/s] +Running tokenizer on dataset: 74%|███████▎ | 281000/382000 [14:59<05:23, 311.73 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 284000/382000 [14:59<05:06, 319.43 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 282000/382000 [15:00<05:12, 320.07 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 284000/382000 [15:00<05:11, 315.08 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 283000/382000 [15:00<05:16, 312.37 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 287000/382000 [15:01<04:49, 327.94 examples/s] +Running tokenizer on dataset: 74%|███████▎ | 281000/382000 [15:02<05:19, 316.58 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 282000/382000 [15:02<05:20, 311.95 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 285000/382000 [15:02<05:04, 318.25 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 283000/382000 [15:03<05:08, 321.04 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 284000/382000 [15:03<05:09, 316.20 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 285000/382000 [15:03<05:08, 314.08 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 288000/382000 [15:04<04:48, 326.29 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 282000/382000 [15:05<05:15, 317.25 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 283000/382000 [15:05<05:14, 314.35 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 286000/382000 [15:05<04:59, 320.77 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 284000/382000 [15:06<05:01, 325.35 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 286000/382000 [15:06<05:02, 317.40 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 285000/382000 [15:06<05:10, 312.05 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 289000/382000 [15:07<04:45, 325.93 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 283000/382000 [15:08<05:12, 316.85 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 284000/382000 
[15:08<05:08, 318.04 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 287000/382000 [15:08<04:53, 323.15 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 285000/382000 [15:09<04:59, 324.28 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 287000/382000 [15:09<04:58, 318.26 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 286000/382000 [15:10<05:06, 312.76 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 290000/382000 [15:10<04:42, 325.54 examples/s] +Running tokenizer on dataset: 74%|███████▍ | 284000/382000 [15:11<05:05, 321.05 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 288000/382000 [15:11<04:50, 323.29 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 285000/382000 [15:11<05:05, 317.52 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 286000/382000 [15:12<04:54, 325.57 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 288000/382000 [15:13<04:55, 318.44 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 287000/382000 [15:13<05:01, 315.02 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 291000/382000 [15:13<04:37, 327.43 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 285000/382000 [15:14<05:02, 320.36 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 286000/382000 [15:14<05:00, 319.85 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 289000/382000 [15:14<04:48, 322.08 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 287000/382000 [15:15<04:50, 327.18 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 289000/382000 [15:16<04:53, 317.29 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 288000/382000 [15:16<04:59, 313.60 examples/s] +Running tokenizer on dataset: 76%|███████▋ | 292000/382000 [15:16<04:34, 327.50 examples/s] +Running tokenizer on dataset: 75%|███████▍ | 286000/382000 [15:17<04:59, 320.79 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 287000/382000 [15:17<04:54, 322.81 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 290000/382000 [15:17<04:44, 322.91 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 288000/382000 [15:18<04:48, 326.30 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 290000/382000 [15:19<04:49, 318.30 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 293000/382000 [15:19<04:32, 326.31 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 289000/382000 [15:19<04:58, 311.25 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 287000/382000 [15:20<04:53, 323.16 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 291000/382000 [15:20<04:40, 324.73 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 288000/382000 [15:20<04:52, 321.63 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 289000/382000 [15:21<04:45, 326.06 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 291000/382000 [15:22<04:44, 320.16 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 294000/382000 [15:22<04:32, 322.88 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 290000/382000 [15:22<04:54, 312.28 examples/s] +Running tokenizer on dataset: 75%|███████▌ | 288000/382000 [15:23<04:52, 321.36 examples/s] +Running tokenizer on dataset: 76%|███████▋ | 292000/382000 [15:24<04:37, 324.34 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 289000/382000 [15:24<04:48, 322.56 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 290000/382000 [15:24<04:42, 325.77 examples/s] +Running tokenizer on dataset: 76%|███████▋ | 292000/382000 [15:25<04:41, 319.93 
examples/s] +Running tokenizer on dataset: 76%|███████▌ | 291000/382000 [15:26<04:47, 316.12 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 295000/382000 [15:26<04:30, 321.41 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 289000/382000 [15:26<04:49, 321.14 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 290000/382000 [15:27<04:43, 323.95 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 293000/382000 [15:27<04:35, 322.56 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 291000/382000 [15:27<04:38, 326.96 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 293000/382000 [15:28<04:38, 319.26 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 296000/382000 [15:29<04:25, 324.36 examples/s] +Running tokenizer on dataset: 76%|███████▋ | 292000/382000 [15:29<04:43, 317.53 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 290000/382000 [15:30<04:45, 322.00 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 291000/382000 [15:30<04:39, 325.22 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 294000/382000 [15:30<04:34, 320.42 examples/s] +Running tokenizer on dataset: 76%|███████▋ | 292000/382000 [15:30<04:35, 326.79 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 294000/382000 [15:31<04:37, 316.58 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 297000/382000 [15:32<04:22, 323.52 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 293000/382000 [15:32<04:41, 316.05 examples/s] +Running tokenizer on dataset: 76%|███████▌ | 291000/382000 [15:33<04:41, 323.35 examples/s] +Running tokenizer on dataset: 76%|███████▋ | 292000/382000 [15:33<04:36, 325.18 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 295000/382000 [15:33<04:31, 320.11 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 293000/382000 [15:34<04:33, 325.67 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 295000/382000 [15:35<04:35, 316.05 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 298000/382000 [15:35<04:20, 323.05 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 294000/382000 [15:35<04:38, 316.08 examples/s] +Running tokenizer on dataset: 76%|███████▋ | 292000/382000 [15:36<04:38, 323.42 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 293000/382000 [15:36<04:35, 323.61 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 296000/382000 [15:36<04:26, 322.46 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 294000/382000 [15:37<04:31, 324.16 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 296000/382000 [15:38<04:30, 318.44 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 299000/382000 [15:38<04:16, 323.09 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 295000/382000 [15:38<04:36, 315.20 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 293000/382000 [15:39<04:36, 321.98 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 294000/382000 [15:39<04:33, 322.17 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 297000/382000 [15:39<04:23, 322.02 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 295000/382000 [15:40<04:28, 323.65 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 297000/382000 [15:41<04:27, 317.57 examples/s] +Running tokenizer on dataset: 79%|███████▊ | 300000/382000 [15:41<04:12, 324.70 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 296000/382000 [15:41<04:31, 316.27 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 294000/382000 [15:42<04:35, 319.37 examples/s] +Running 
tokenizer on dataset: 77%|███████▋ | 295000/382000 [15:42<04:30, 322.13 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 298000/382000 [15:42<04:20, 321.85 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 296000/382000 [15:43<04:23, 326.27 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 298000/382000 [15:44<04:25, 316.97 examples/s] +Running tokenizer on dataset: 79%|███████▉ | 301000/382000 [15:44<04:08, 325.65 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 297000/382000 [15:44<04:27, 317.78 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 296000/382000 [15:45<04:24, 325.16 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 295000/382000 [15:45<04:33, 317.94 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 299000/382000 [15:45<04:17, 321.88 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 297000/382000 [15:46<04:21, 324.64 examples/s] +Running tokenizer on dataset: 79%|███████▉ | 302000/382000 [15:47<04:04, 327.39 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 299000/382000 [15:47<04:21, 317.61 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 298000/382000 [15:48<04:24, 317.20 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 297000/382000 [15:48<04:22, 324.12 examples/s] +Running tokenizer on dataset: 77%|███████▋ | 296000/382000 [15:48<04:27, 322.02 examples/s] +Running tokenizer on dataset: 79%|███████▊ | 300000/382000 [15:48<04:12, 324.77 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 298000/382000 [15:49<04:19, 324.32 examples/s] +Running tokenizer on dataset: 79%|███████▉ | 303000/382000 [15:50<03:59, 330.41 examples/s] +Running tokenizer on dataset: 79%|███████▊ | 300000/382000 [15:50<04:17, 317.93 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 299000/382000 [15:51<04:22, 316.66 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 297000/382000 [15:51<04:24, 321.17 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 298000/382000 [15:51<04:20, 322.05 examples/s] +Running tokenizer on dataset: 79%|███████▉ | 301000/382000 [15:51<04:08, 325.34 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 299000/382000 [15:52<04:15, 324.22 examples/s] +Running tokenizer on dataset: 80%|███████▉ | 304000/382000 [15:53<03:56, 329.19 examples/s] +Running tokenizer on dataset: 79%|███████▉ | 301000/382000 [15:53<04:13, 319.95 examples/s] +Running tokenizer on dataset: 79%|███████▊ | 300000/382000 [15:54<04:17, 319.01 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 299000/382000 [15:54<04:16, 323.10 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 298000/382000 [15:54<04:21, 320.98 examples/s] +Running tokenizer on dataset: 79%|███████▉ | 302000/382000 [15:54<04:06, 325.19 examples/s] +Running tokenizer on dataset: 79%|███████▊ | 300000/382000 [15:55<04:10, 327.42 examples/s] +Running tokenizer on dataset: 80%|███████▉ | 305000/382000 [15:56<03:54, 327.81 examples/s] +Running tokenizer on dataset: 79%|███████▉ | 302000/382000 [15:57<04:11, 318.58 examples/s] +Running tokenizer on dataset: 79%|███████▉ | 301000/382000 [15:57<04:11, 321.87 examples/s] +Running tokenizer on dataset: 79%|███████▊ | 300000/382000 [15:57<04:12, 325.35 examples/s] +Running tokenizer on dataset: 79%|███████▉ | 303000/382000 [15:58<04:01, 326.60 examples/s] +Running tokenizer on dataset: 78%|███████▊ | 299000/382000 [15:58<04:19, 320.25 examples/s] +Running tokenizer on dataset: 79%|███████▉ | 301000/382000 [15:58<04:05, 329.79 examples/s] +Running tokenizer on 
[... ~20 minutes of interleaved tqdm updates from 8 worker processes elided: each process runs the tokenizer over the full 382,000 examples at a steady ~305-330 examples/s, all bars completing between 19:55 and 20:16 elapsed ...]
+Running tokenizer on dataset: 100%|██████████| 382000/382000 [20:16<00:00, 314.01 examples/s]
+[INFO|trainer.py:1760] 2023-11-12 03:23:45,386 >> ***** Running training *****
+[INFO|trainer.py:1761] 2023-11-12 03:23:45,387 >> Num examples = 382,000
+[INFO|trainer.py:1762] 2023-11-12 03:23:45,387 >> Num Epochs = 3
+[INFO|trainer.py:1763] 2023-11-12 03:23:45,387 >> Instantaneous batch size per device = 4
+[INFO|trainer.py:1766] 2023-11-12 03:23:45,387 >> Total train batch size (w. parallel, distributed & accumulation) = 256
+[INFO|trainer.py:1767] 2023-11-12 03:23:45,387 >> Gradient Accumulation steps = 8
+[INFO|trainer.py:1768] 2023-11-12 03:23:45,387 >> Total optimization steps = 4,476
+[INFO|trainer.py:1769] 2023-11-12 03:23:45,390 >> Number of trainable parameters = 1,949,696
+
+  0%|          | 0/4476 [00:00<?, ?it/s]
[... per-step bars and the loss logs for steps 1-200 are truncated in the source; the log resumes at the step-200 checkpoint save, whose timestamp is also lost ...]
+[INFO|trainer.py:2939] >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-200
+[INFO|tokenization_utils_base.py:2437] 2023-11-12 04:41:34,371 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-200/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-12 04:41:34,372 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-200/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-12 04:41:34,372 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-200/added_tokens.json
[... intermediate per-step tqdm bars elided from here on; only the bar preceding each 10-step loss log is kept. Step time holds at ~23.1-23.6 s/it throughout ...]
+  5%|▍         | 210/4476 [1:21:40<27:38:03, 23.32s/it]
+{'loss': 0.5101, 'learning_rate': 4.972892978596069e-05, 'epoch': 0.14}
+  5%|▍         | 220/4476 [1:25:33<27:34:44, 23.33s/it]
+{'loss': 0.5125, 'learning_rate': 4.970255143517838e-05, 'epoch': 0.15}
+  5%|▌         | 230/4476 [1:29:26<27:29:14, 23.31s/it]
+{'loss': 0.4928, 'learning_rate': 4.967495617114826e-05, 'epoch': 0.15}
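A note on the run configuration printed above: the numbers are internally consistent. Below is a minimal sketch of the arithmetic, assuming the 8 DDP ranks implied by 256 / (4 x 8) and HuggingFace Trainer's floor division of per-rank dataloader batches by the accumulation steps (a reconstruction for illustration, not part of the original log):

import math

num_examples = 382_000
per_device_batch = 4
grad_accum = 8
num_epochs = 3
world_size = 8  # assumption: implied by 256 / (4 * 8)

# Effective batch consumed per optimizer step across all ranks.
total_batch = per_device_batch * grad_accum * world_size
assert total_batch == 256

# Each rank's dataloader sees 1/world_size of the examples; update steps
# per epoch are the rank's batches floor-divided by grad_accum.
batches_per_rank = math.ceil(num_examples / world_size / per_device_batch)  # 11938
steps_per_epoch = batches_per_rank // grad_accum                            # 1492
assert steps_per_epoch * num_epochs == 4476  # matches "Total optimization steps"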
+  5%|▌         | 240/4476 [1:33:19<27:26:20, 23.32s/it]
+{'loss': 0.4878, 'learning_rate': 4.964614535328626e-05, 'epoch': 0.16}
+  6%|▌         | 250/4476 [1:37:13<27:21:35, 23.31s/it]
+{'loss': 0.5017, 'learning_rate': 4.961612040088973e-05, 'epoch': 0.17}
+  6%|▌         | 260/4476 [1:41:07<27:20:34, 23.35s/it]
+{'loss': 0.4863, 'learning_rate': 4.9584882793067534e-05, 'epoch': 0.17}
+  6%|▌         | 270/4476 [1:45:00<27:14:49, 23.32s/it]
+{'loss': 0.4847, 'learning_rate': 4.955243406866713e-05, 'epoch': 0.18}
+  6%|▋         | 280/4476 [1:48:53<27:17:44, 23.42s/it]
+{'loss': 0.4868, 'learning_rate': 4.951877582619881e-05, 'epoch': 0.19}
+  6%|▋         | 290/4476 [1:52:46<27:05:40, 23.30s/it]
+{'loss': 0.4748, 'learning_rate': 4.948390972375694e-05, 'epoch': 0.19}
+  7%|▋         | 300/4476 [1:56:41<27:09:05, 23.41s/it]
+{'loss': 0.4764, 'learning_rate': 4.944783747893825e-05, 'epoch': 0.2}
+  7%|▋         | 310/4476 [2:00:35<27:15:34, 23.56s/it]
+{'loss': 0.4712, 'learning_rate': 4.941056086875727e-05, 'epoch': 0.21}
+  7%|▋         | 320/4476 [2:04:29<26:58:26, 23.37s/it]
+{'loss': 0.4642, 'learning_rate': 4.937208172955876e-05, 'epoch': 0.21}
+  7%|▋         | 330/4476 [2:08:23<26:57:05, 23.40s/it]
+{'loss': 0.4642, 'learning_rate': 4.9332401956927224e-05, 'epoch': 0.22}
+  8%|▊         | 340/4476 [2:12:16<26:35:51, 23.15s/it]
+{'loss': 0.4709, 'learning_rate': 4.9291523505593604e-05, 'epoch': 0.23}
+  8%|▊         | 350/4476 [2:16:09<26:49:53, 23.41s/it]
+{'loss': 0.461, 'learning_rate': 4.9249448389338905e-05, 'epoch': 0.23}
+  8%|▊         | 360/4476 [2:20:04<26:49:49, 23.47s/it]
+{'loss': 0.4677, 'learning_rate': 4.920617868089501e-05, 'epoch': 0.24}
+  8%|▊         | 370/4476 [2:23:59<26:44:43, 23.45s/it]
+{'loss': 0.4564, 'learning_rate': 4.9161716511842614e-05, 'epoch': 0.25}
+  8%|▊         | 380/4476 [2:27:52<26:27:58, 23.26s/it]
+{'loss': 0.4663, 'learning_rate': 4.911606407250617e-05, 'epoch': 0.25}
+  9%|▊         | 390/4476 [2:31:45<26:31:58, 23.38s/it]
+{'loss': 0.4682, 'learning_rate': 4.9069223611846014e-05, 'epoch': 0.26}
+  9%|▉         | 400/4476 [2:35:38<26:15:48, 23.20s/it]
+{'loss': 0.4636, 'learning_rate': 4.9021197437347555e-05, 'epoch': 0.27}
+[INFO|trainer.py:2939] 2023-11-12 05:59:25,475 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-400
+[INFO|tokenization_utils_base.py:2437] 2023-11-12 05:59:25,507 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-400/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-12 05:59:25,507 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-400/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-12 05:59:25,507 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-400/added_tokens.json
+  9%|▉         | 410/4476 [2:39:32<26:24:35, 23.38s/it]
+{'loss': 0.4569, 'learning_rate': 4.897198791490762e-05, 'epoch': 0.27}
+  9%|▉         | 420/4476 [2:43:25<26:17:45, 23.34s/it]
+{'loss': 0.462, 'learning_rate': 4.8921597468717887e-05, 'epoch': 0.28}
+ 10%|▉         | 430/4476 [2:47:20<26:21:42, 23.46s/it]
+{'loss': 0.4563, 'learning_rate': 4.887002858114548e-05, 'epoch': 0.29}
+ 10%|▉         | 440/4476 [2:51:14<26:07:38, 23.30s/it]
+{'loss': 0.4563, 'learning_rate': 4.881728379261068e-05, 'epoch': 0.29}
+ 10%|█         | 450/4476 [2:55:07<25:59:08, 23.24s/it]
+{'loss': 0.4468, 'learning_rate': 4.876336570146175e-05, 'epoch': 0.3}
[2:56:40<26:00:53, 23.29s/it] + 10%|█ | 455/4476 [2:57:04<26:01:33, 23.30s/it] + 10%|█ | 456/4476 [2:57:27<26:03:03, 23.33s/it] + 10%|█ | 457/4476 [2:57:50<26:05:00, 23.36s/it] + 10%|█ | 458/4476 [2:58:14<25:58:24, 23.27s/it] + 10%|█ | 459/4476 [2:58:37<26:01:40, 23.33s/it] + 10%|█ | 460/4476 [2:59:01<26:05:50, 23.39s/it] + +{'loss': 0.4508, 'learning_rate': 4.870827696384698e-05, 'epoch': 0.31} + + 10%|█ | 460/4476 [2:59:01<26:05:50, 23.39s/it] + 10%|█ | 461/4476 [2:59:24<26:06:49, 23.41s/it] + 10%|█ | 462/4476 [2:59:47<26:05:21, 23.40s/it] + 10%|█ | 463/4476 [3:00:11<26:06:00, 23.41s/it] + 10%|█ | 464/4476 [3:00:34<26:05:41, 23.42s/it] + 10%|█ | 465/4476 [3:00:58<26:07:45, 23.45s/it] + 10%|█ | 466/4476 [3:01:21<26:06:23, 23.44s/it] + 10%|█ | 467/4476 [3:01:44<25:58:01, 23.32s/it] + 10%|█ | 468/4476 [3:02:07<25:55:52, 23.29s/it] + 10%|█ | 469/4476 [3:02:31<25:57:51, 23.33s/it] + 11%|█ | 470/4476 [3:02:54<25:56:18, 23.31s/it] + +{'loss': 0.4507, 'learning_rate': 4.865202029358379e-05, 'epoch': 0.31} + + 11%|█ | 470/4476 [3:02:54<25:56:18, 23.31s/it] + 11%|█ | 471/4476 [3:03:17<25:54:52, 23.29s/it] + 11%|█ | 472/4476 [3:03:41<26:04:02, 23.44s/it] + 11%|█ | 473/4476 [3:04:05<26:09:01, 23.52s/it] + 11%|█ | 474/4476 [3:04:29<26:11:00, 23.55s/it] + 11%|█ | 475/4476 [3:04:52<26:15:28, 23.63s/it] + 11%|█ | 476/4476 [3:05:15<26:04:12, 23.46s/it] + 11%|█ | 477/4476 [3:05:39<25:57:51, 23.37s/it] + 11%|█ | 478/4476 [3:06:02<25:53:35, 23.32s/it] + 11%|█ | 479/4476 [3:06:25<25:58:03, 23.39s/it] + 11%|█ | 480/4476 [3:06:49<25:56:46, 23.37s/it] + +{'loss': 0.4486, 'learning_rate': 4.859459846202507e-05, 'epoch': 0.32} + + 11%|█ | 480/4476 [3:06:49<25:56:46, 23.37s/it] + 11%|█ | 481/4476 [3:07:12<25:51:03, 23.29s/it] + 11%|█ | 482/4476 [3:07:35<25:47:57, 23.25s/it] + 11%|█ | 483/4476 [3:07:58<25:50:13, 23.29s/it] + 11%|█ | 484/4476 [3:08:22<25:51:52, 23.32s/it] + 11%|█ | 485/4476 [3:08:45<25:47:48, 23.27s/it] + 11%|█ | 486/4476 [3:09:08<25:45:46, 23.24s/it] + 11%|█ | 487/4476 [3:09:31<25:44:49, 23.24s/it] + 11%|█ | 488/4476 [3:09:55<25:48:01, 23.29s/it] + 11%|█ | 489/4476 [3:10:18<25:39:50, 23.17s/it] + 11%|█ | 490/4476 [3:10:41<25:41:42, 23.21s/it] + +{'loss': 0.4423, 'learning_rate': 4.853601429792265e-05, 'epoch': 0.33} + + 11%|█ | 490/4476 [3:10:41<25:41:42, 23.21s/it] + 11%|█ | 491/4476 [3:11:04<25:39:06, 23.17s/it] + 11%|█ | 492/4476 [3:11:27<25:38:49, 23.18s/it] + 11%|█ | 493/4476 [3:11:50<25:39:06, 23.19s/it] + 11%|█ | 494/4476 [3:12:14<25:39:58, 23.20s/it] + 11%|█ | 495/4476 [3:12:37<25:40:33, 23.22s/it] + 11%|█ | 496/4476 [3:13:00<25:35:46, 23.15s/it] + 11%|█ | 497/4476 [3:13:23<25:40:41, 23.23s/it] + 11%|█ | 498/4476 [3:13:47<25:45:52, 23.32s/it] + 11%|█ | 499/4476 [3:14:10<25:45:16, 23.31s/it] + 11%|█ | 500/4476 [3:14:34<25:49:12, 23.38s/it] + +{'loss': 0.4369, 'learning_rate': 4.847627068728795e-05, 'epoch': 0.34} + + 11%|█ | 500/4476 [3:14:34<25:49:12, 23.38s/it] + 11%|█ | 501/4476 [3:14:57<25:47:56, 23.37s/it] + 11%|█ | 502/4476 [3:15:20<25:50:26, 23.41s/it] + 11%|█ | 503/4476 [3:15:44<25:47:51, 23.38s/it] + 11%|█▏ | 504/4476 [3:16:07<25:52:02, 23.44s/it] + 11%|█▏ | 505/4476 [3:16:31<25:47:29, 23.38s/it] + 11%|█▏ | 506/4476 [3:16:54<25:42:12, 23.31s/it] + 11%|█▏ | 507/4476 [3:17:17<25:48:23, 23.41s/it] + 11%|█▏ | 508/4476 [3:17:41<25:47:37, 23.40s/it] + 11%|█▏ | 509/4476 [3:18:04<25:45:38, 23.38s/it] + 11%|█▏ | 510/4476 [3:18:28<25:47:43, 23.41s/it] + +{'loss': 0.4429, 'learning_rate': 4.841537057324979e-05, 'epoch': 0.34} + + 11%|█▏ | 510/4476 [3:18:28<25:47:43, 23.41s/it] + 11%|█▏ | 
511/4476 [3:18:51<25:47:31, 23.42s/it] + 11%|█▏ | 512/4476 [3:19:14<25:44:47, 23.38s/it] + 11%|█▏ | 513/4476 [3:19:37<25:37:15, 23.27s/it] + 11%|█▏ | 514/4476 [3:20:01<25:40:41, 23.33s/it] + 12%|█▏ | 515/4476 [3:20:24<25:38:47, 23.31s/it] + 12%|█▏ | 516/4476 [3:20:48<25:43:33, 23.39s/it] + 12%|█▏ | 517/4476 [3:21:11<25:37:56, 23.31s/it] + 12%|█▏ | 518/4476 [3:21:34<25:36:13, 23.29s/it] + 12%|█▏ | 519/4476 [3:21:57<25:40:21, 23.36s/it] + 12%|█▏ | 520/4476 [3:22:21<25:48:45, 23.49s/it] + +{'loss': 0.4389, 'learning_rate': 4.835331695590943e-05, 'epoch': 0.35} + + 12%|█▏ | 520/4476 [3:22:21<25:48:45, 23.49s/it] + 12%|█▏ | 521/4476 [3:22:45<25:43:36, 23.42s/it] + 12%|█▏ | 522/4476 [3:23:08<25:40:35, 23.38s/it] + 12%|█▏ | 523/4476 [3:23:31<25:42:01, 23.41s/it] + 12%|█▏ | 524/4476 [3:23:54<25:36:01, 23.32s/it] + 12%|█▏ | 525/4476 [3:24:18<25:35:16, 23.31s/it] + 12%|█▏ | 526/4476 [3:24:41<25:35:49, 23.33s/it] + 12%|█▏ | 527/4476 [3:25:04<25:36:52, 23.35s/it] + 12%|█▏ | 528/4476 [3:25:28<25:45:20, 23.49s/it] + 12%|█▏ | 529/4476 [3:25:51<25:39:52, 23.41s/it] + 12%|█▏ | 530/4476 [3:26:15<25:41:15, 23.44s/it] + +{'loss': 0.44, 'learning_rate': 4.829011289219276e-05, 'epoch': 0.36} + + 12%|█▏ | 530/4476 [3:26:15<25:41:15, 23.44s/it] + 12%|█▏ | 531/4476 [3:26:38<25:39:05, 23.41s/it] + 12%|█▏ | 532/4476 [3:27:02<25:42:47, 23.47s/it] + 12%|█▏ | 533/4476 [3:27:25<25:43:46, 23.49s/it] + 12%|█▏ | 534/4476 [3:27:49<25:40:31, 23.45s/it] + 12%|█▏ | 535/4476 [3:28:13<25:44:26, 23.51s/it] + 12%|█▏ | 536/4476 [3:28:36<25:36:38, 23.40s/it] + 12%|█▏ | 537/4476 [3:28:59<25:43:35, 23.51s/it] + 12%|█▏ | 538/4476 [3:29:23<25:38:58, 23.45s/it] + 12%|█▏ | 539/4476 [3:29:46<25:30:54, 23.33s/it] + 12%|█▏ | 540/4476 [3:30:09<25:30:56, 23.34s/it] + +{'loss': 0.4476, 'learning_rate': 4.82257614956997e-05, 'epoch': 0.36} + + 12%|█▏ | 540/4476 [3:30:09<25:30:56, 23.34s/it] + 12%|█▏ | 541/4476 [3:30:32<25:30:35, 23.34s/it] + 12%|█▏ | 542/4476 [3:30:56<25:31:01, 23.35s/it] + 12%|█▏ | 543/4476 [3:31:19<25:28:05, 23.31s/it] + 12%|█▏ | 544/4476 [3:31:42<25:28:05, 23.32s/it] + 12%|█▏ | 545/4476 [3:32:06<25:26:07, 23.29s/it] + 12%|█▏ | 546/4476 [3:32:29<25:29:25, 23.35s/it] + 12%|█▏ | 547/4476 [3:32:52<25:28:42, 23.35s/it] + 12%|█▏ | 548/4476 [3:33:16<25:36:18, 23.47s/it] + 12%|█▏ | 549/4476 [3:33:39<25:32:01, 23.41s/it] + 12%|█▏ | 550/4476 [3:34:03<25:33:50, 23.44s/it] + +{'loss': 0.4367, 'learning_rate': 4.816026593655085e-05, 'epoch': 0.37} + + 12%|█▏ | 550/4476 [3:34:03<25:33:50, 23.44s/it] + 12%|█▏ | 551/4476 [3:34:26<25:29:15, 23.38s/it] + 12%|█▏ | 552/4476 [3:34:49<25:23:32, 23.30s/it] + 12%|█▏ | 553/4476 [3:35:12<25:17:49, 23.21s/it] + 12%|█▏ | 554/4476 [3:35:36<25:19:31, 23.25s/it] + 12%|█▏ | 555/4476 [3:35:59<25:22:41, 23.30s/it] + 12%|█▏ | 556/4476 [3:36:22<25:19:48, 23.26s/it] + 12%|█▏ | 557/4476 [3:36:46<25:24:10, 23.34s/it] + 12%|█▏ | 558/4476 [3:37:09<25:17:35, 23.24s/it] + 12%|█▏ | 559/4476 [3:37:32<25:23:11, 23.33s/it] + 13%|█▎ | 560/4476 [3:37:56<25:29:18, 23.43s/it] + +{'loss': 0.4357, 'learning_rate': 4.809362944123129e-05, 'epoch': 0.38} + + 13%|█▎ | 560/4476 [3:37:56<25:29:18, 23.43s/it] + 13%|█▎ | 561/4476 [3:38:20<25:36:37, 23.55s/it] + 13%|█▎ | 562/4476 [3:38:43<25:35:09, 23.53s/it] + 13%|█▎ | 563/4476 [3:39:07<25:30:46, 23.47s/it] + 13%|█▎ | 564/4476 [3:39:30<25:36:51, 23.57s/it] + 13%|█▎ | 565/4476 [3:39:53<25:25:00, 23.40s/it] + 13%|█▎ | 566/4476 [3:40:17<25:19:57, 23.32s/it] + 13%|█▎ | 567/4476 [3:40:40<25:26:35, 23.43s/it] + 13%|█▎ | 568/4476 [3:41:04<25:26:38, 23.44s/it] + 13%|█▎ | 569/4476 
[3:41:27<25:23:24, 23.40s/it] + 13%|█▎ | 570/4476 [3:41:50<25:19:26, 23.34s/it] + +{'loss': 0.4492, 'learning_rate': 4.802585529243164e-05, 'epoch': 0.38} + + 13%|█▎ | 570/4476 [3:41:50<25:19:26, 23.34s/it] + 13%|█▎ | 571/4476 [3:42:13<25:12:18, 23.24s/it] + 13%|█▎ | 572/4476 [3:42:37<25:16:56, 23.31s/it] + 13%|█▎ | 573/4476 [3:43:00<25:20:01, 23.37s/it] + 13%|█▎ | 574/4476 [3:43:24<25:20:38, 23.38s/it] + 13%|█▎ | 575/4476 [3:43:47<25:12:25, 23.26s/it] + 13%|█▎ | 576/4476 [3:44:10<25:16:32, 23.33s/it] + 13%|█▎ | 577/4476 [3:44:34<25:20:20, 23.40s/it] + 13%|█▎ | 578/4476 [3:44:57<25:18:18, 23.37s/it] + 13%|█▎ | 579/4476 [3:45:20<25:17:23, 23.36s/it] + 13%|█▎ | 580/4476 [3:45:44<25:25:06, 23.49s/it] + +{'loss': 0.4403, 'learning_rate': 4.795694682888635e-05, 'epoch': 0.39} + + 13%|█▎ | 580/4476 [3:45:44<25:25:06, 23.49s/it] + 13%|█▎ | 581/4476 [3:46:07<25:17:40, 23.38s/it] + 13%|█▎ | 582/4476 [3:46:30<25:14:42, 23.34s/it] + 13%|█▎ | 583/4476 [3:46:54<25:14:01, 23.33s/it] + 13%|█▎ | 584/4476 [3:47:17<25:16:55, 23.39s/it] + 13%|█▎ | 585/4476 [3:47:41<25:14:00, 23.35s/it] + 13%|█▎ | 586/4476 [3:48:04<25:14:43, 23.36s/it] + 13%|█▎ | 587/4476 [3:48:27<25:17:44, 23.42s/it] + 13%|█▎ | 588/4476 [3:48:51<25:14:53, 23.38s/it] + 13%|█▎ | 589/4476 [3:49:14<25:10:18, 23.31s/it] + 13%|█▎ | 590/4476 [3:49:37<25:02:22, 23.20s/it] + +{'loss': 0.4406, 'learning_rate': 4.7886907445209234e-05, 'epoch': 0.4} + + 13%|█▎ | 590/4476 [3:49:37<25:02:22, 23.20s/it] + 13%|█▎ | 591/4476 [3:50:00<24:59:08, 23.15s/it] + 13%|█▎ | 592/4476 [3:50:24<25:10:16, 23.33s/it] + 13%|█▎ | 593/4476 [3:50:47<25:09:36, 23.33s/it] + 13%|█▎ | 594/4476 [3:51:10<25:06:08, 23.28s/it] + 13%|█▎ | 595/4476 [3:51:33<25:05:18, 23.27s/it] + 13%|█▎ | 596/4476 [3:51:57<25:08:21, 23.33s/it] + 13%|█▎ | 597/4476 [3:52:20<25:12:58, 23.40s/it] + 13%|█▎ | 598/4476 [3:52:44<25:15:28, 23.45s/it] + 13%|█▎ | 599/4476 [3:53:07<25:06:21, 23.31s/it] + 13%|█▎ | 600/4476 [3:53:31<25:10:21, 23.38s/it] + +{'loss': 0.4317, 'learning_rate': 4.781574059172621e-05, 'epoch': 0.4} + + 13%|█▎ | 600/4476 [3:53:31<25:10:21, 23.38s/it][INFO|trainer.py:2939] 2023-11-12 07:17:18,259 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-600 +[INFO|tokenization_utils_base.py:2437] 2023-11-12 07:17:18,290 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-600/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-11-12 07:17:18,290 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-600/special_tokens_map.json +[INFO|tokenization_utils_base.py:2496] 2023-11-12 07:17:18,290 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-600/added_tokens.json + + 13%|█▎ | 601/4476 [3:53:54<25:03:31, 23.28s/it] + 13%|█▎ | 602/4476 [3:54:17<25:13:27, 23.44s/it] + 13%|█▎ | 603/4476 [3:54:41<25:08:54, 23.38s/it] + 13%|█▎ | 604/4476 [3:55:04<25:11:36, 23.42s/it] + 14%|█▎ | 605/4476 [3:55:27<24:59:09, 23.24s/it] + 14%|█▎ | 606/4476 [3:55:50<24:50:56, 23.12s/it] + 14%|█▎ | 607/4476 [3:56:13<24:53:51, 23.17s/it] + 14%|█▎ | 608/4476 [3:56:37<24:58:54, 23.25s/it] + 14%|█▎ | 609/4476 [3:57:00<25:08:42, 23.41s/it] + 14%|█▎ | 610/4476 [3:57:24<25:15:34, 23.52s/it] + +{'loss': 0.4379, 'learning_rate': 4.7743449774305386e-05, 'epoch': 0.41} + + 14%|█▎ | 610/4476 [3:57:24<25:15:34, 23.52s/it] + 14%|█▎ | 611/4476 [3:57:47<25:11:11, 23.46s/it] + 14%|█▎ | 612/4476 [3:58:11<25:14:14, 23.51s/it] + 14%|█▎ | 
613/4476 [3:58:34<25:06:30, 23.40s/it] + 14%|█▎ | 614/4476 [3:58:58<25:05:47, 23.39s/it] + 14%|█▎ | 615/4476 [3:59:21<25:12:13, 23.50s/it] + 14%|█▍ | 616/4476 [3:59:45<25:14:59, 23.55s/it] + 14%|█▍ | 617/4476 [4:00:08<25:11:27, 23.50s/it] + 14%|█▍ | 618/4476 [4:00:32<25:11:34, 23.51s/it] + 14%|█▍ | 619/4476 [4:00:55<25:07:15, 23.45s/it] + 14%|█▍ | 620/4476 [4:01:19<25:09:01, 23.48s/it] + +{'loss': 0.4324, 'learning_rate': 4.7670038554184296e-05, 'epoch': 0.42} + + 14%|█▍ | 620/4476 [4:01:19<25:09:01, 23.48s/it] + 14%|█▍ | 621/4476 [4:01:42<25:06:03, 23.44s/it] + 14%|█▍ | 622/4476 [4:02:05<25:02:50, 23.40s/it] + 14%|█▍ | 623/4476 [4:02:29<25:00:39, 23.37s/it] + 14%|█▍ | 624/4476 [4:02:52<24:58:55, 23.35s/it] + 14%|█▍ | 625/4476 [4:03:15<24:57:38, 23.33s/it] + 14%|█▍ | 626/4476 [4:03:39<25:01:36, 23.40s/it] + 14%|█▍ | 627/4476 [4:04:02<24:59:48, 23.38s/it] + 14%|█▍ | 628/4476 [4:04:25<24:54:01, 23.30s/it] + 14%|█▍ | 629/4476 [4:04:49<24:55:03, 23.32s/it] + 14%|█▍ | 630/4476 [4:05:12<25:00:06, 23.40s/it] + +{'loss': 0.4329, 'learning_rate': 4.7595510547794465e-05, 'epoch': 0.42} + + 14%|█▍ | 630/4476 [4:05:12<25:00:06, 23.40s/it] + 14%|█▍ | 631/4476 [4:05:36<25:06:44, 23.51s/it] + 14%|█▍ | 632/4476 [4:05:59<24:58:23, 23.39s/it] + 14%|█▍ | 633/4476 [4:06:22<24:55:13, 23.34s/it] + 14%|█▍ | 634/4476 [4:06:46<24:53:04, 23.32s/it] + 14%|█▍ | 635/4476 [4:07:09<24:54:10, 23.34s/it] + 14%|█▍ | 636/4476 [4:07:32<24:55:39, 23.37s/it] + 14%|█▍ | 637/4476 [4:07:56<25:03:07, 23.49s/it] + 14%|█▍ | 638/4476 [4:08:20<25:02:48, 23.49s/it] + 14%|█▍ | 639/4476 [4:08:43<25:06:39, 23.56s/it] + 14%|█▍ | 640/4476 [4:09:07<24:58:40, 23.44s/it] + +{'loss': 0.4259, 'learning_rate': 4.751986942658332e-05, 'epoch': 0.43} + + 14%|█▍ | 640/4476 [4:09:07<24:58:40, 23.44s/it] + 14%|█▍ | 641/4476 [4:09:30<24:55:36, 23.40s/it] + 14%|█▍ | 642/4476 [4:09:53<24:50:38, 23.33s/it] + 14%|█▍ | 643/4476 [4:10:17<24:58:35, 23.46s/it] + 14%|█▍ | 644/4476 [4:10:40<24:59:15, 23.47s/it] + 14%|█▍ | 645/4476 [4:11:04<24:59:32, 23.49s/it] + 14%|█▍ | 646/4476 [4:11:27<24:57:56, 23.47s/it] + 14%|█▍ | 647/4476 [4:11:50<24:53:14, 23.40s/it] + 14%|█▍ | 648/4476 [4:12:14<24:58:51, 23.49s/it] + 14%|█▍ | 649/4476 [4:12:38<24:56:12, 23.46s/it] + 15%|█▍ | 650/4476 [4:13:01<24:52:22, 23.40s/it] + +{'loss': 0.4256, 'learning_rate': 4.744311891683325e-05, 'epoch': 0.44} + + 15%|█▍ | 650/4476 [4:13:01<24:52:22, 23.40s/it] + 15%|█▍ | 651/4476 [4:13:24<24:46:33, 23.32s/it] + 15%|█▍ | 652/4476 [4:13:47<24:48:00, 23.35s/it] + 15%|█▍ | 653/4476 [4:14:11<24:53:05, 23.43s/it] + 15%|█▍ | 654/4476 [4:14:34<24:46:38, 23.34s/it] + 15%|█▍ | 655/4476 [4:14:58<24:49:57, 23.40s/it] + 15%|█▍ | 656/4476 [4:15:21<24:43:00, 23.29s/it] + 15%|█▍ | 657/4476 [4:15:44<24:44:13, 23.32s/it] + 15%|█▍ | 658/4476 [4:16:08<24:45:32, 23.35s/it] + 15%|█▍ | 659/4476 [4:16:31<24:48:55, 23.40s/it] + 15%|█▍ | 660/4476 [4:16:55<24:54:23, 23.50s/it] + +{'loss': 0.4289, 'learning_rate': 4.736526279947807e-05, 'epoch': 0.44} + + 15%|█▍ | 660/4476 [4:16:55<24:54:23, 23.50s/it] + 15%|█▍ | 661/4476 [4:17:18<24:58:11, 23.56s/it] + 15%|█▍ | 662/4476 [4:17:42<24:49:17, 23.43s/it] + 15%|█▍ | 663/4476 [4:18:05<24:44:59, 23.37s/it] + 15%|█▍ | 664/4476 [4:18:28<24:40:37, 23.30s/it] + 15%|█▍ | 665/4476 [4:18:51<24:35:03, 23.22s/it] + 15%|█▍ | 666/4476 [4:19:14<24:38:16, 23.28s/it] + 15%|█▍ | 667/4476 [4:19:38<24:37:00, 23.27s/it] + 15%|█▍ | 668/4476 [4:20:01<24:39:42, 23.31s/it] + 15%|█▍ | 669/4476 [4:20:25<24:44:24, 23.39s/it] + 15%|█▍ | 670/4476 [4:20:48<24:41:13, 23.35s/it] + +{'loss': 0.4353, 
'learning_rate': 4.728630490991676e-05, 'epoch': 0.45} + + 15%|█▍ | 670/4476 [4:20:48<24:41:13, 23.35s/it] + 15%|█▍ | 671/4476 [4:21:11<24:41:34, 23.36s/it] + 15%|█▌ | 672/4476 [4:21:35<24:39:08, 23.33s/it] + 15%|█▌ | 673/4476 [4:21:58<24:42:32, 23.39s/it] + 15%|█▌ | 674/4476 [4:22:21<24:40:54, 23.37s/it] + 15%|█▌ | 675/4476 [4:22:44<24:32:56, 23.25s/it] + 15%|█▌ | 676/4476 [4:23:08<24:38:26, 23.34s/it] + 15%|█▌ | 677/4476 [4:23:31<24:37:54, 23.34s/it] + 15%|█▌ | 678/4476 [4:23:55<24:36:27, 23.32s/it] + 15%|█▌ | 679/4476 [4:24:18<24:34:10, 23.29s/it] + 15%|█▌ | 680/4476 [4:24:41<24:36:39, 23.34s/it] + +{'loss': 0.4413, 'learning_rate': 4.7206249137824535e-05, 'epoch': 0.46} + + 15%|█▌ | 680/4476 [4:24:41<24:36:39, 23.34s/it] + 15%|█▌ | 681/4476 [4:25:05<24:39:54, 23.40s/it] + 15%|█▌ | 682/4476 [4:25:28<24:36:13, 23.35s/it] + 15%|█▌ | 683/4476 [4:25:51<24:35:14, 23.34s/it] + 15%|█▌ | 684/4476 [4:26:15<24:32:45, 23.30s/it] + 15%|█▌ | 685/4476 [4:26:38<24:28:17, 23.24s/it] + 15%|█▌ | 686/4476 [4:27:01<24:38:22, 23.40s/it] + 15%|█▌ | 687/4476 [4:27:25<24:46:09, 23.53s/it] + 15%|█▌ | 688/4476 [4:27:49<24:40:21, 23.45s/it] + 15%|█▌ | 689/4476 [4:28:12<24:35:53, 23.38s/it] + 15%|█▌ | 690/4476 [4:28:35<24:35:00, 23.38s/it] + +{'loss': 0.4302, 'learning_rate': 4.7125099426961185e-05, 'epoch': 0.46} + + 15%|█▌ | 690/4476 [4:28:35<24:35:00, 23.38s/it] + 15%|█▌ | 691/4476 [4:28:58<24:27:22, 23.26s/it] + 15%|█▌ | 692/4476 [4:29:21<24:25:53, 23.24s/it] + 15%|█▌ | 693/4476 [4:29:44<24:24:50, 23.23s/it] + 16%|█▌ | 694/4476 [4:30:08<24:33:17, 23.37s/it] + 16%|█▌ | 695/4476 [4:30:31<24:30:14, 23.33s/it] + 16%|█▌ | 696/4476 [4:30:55<24:28:12, 23.30s/it] + 16%|█▌ | 697/4476 [4:31:18<24:36:31, 23.44s/it] + 16%|█▌ | 698/4476 [4:31:42<24:34:48, 23.42s/it] + 16%|█▌ | 699/4476 [4:32:05<24:29:14, 23.34s/it] + 16%|█▌ | 700/4476 [4:32:28<24:28:20, 23.33s/it] + +{'loss': 0.4365, 'learning_rate': 4.704285977497687e-05, 'epoch': 0.47} + + 16%|█▌ | 700/4476 [4:32:28<24:28:20, 23.33s/it] + 16%|█▌ | 701/4476 [4:32:52<24:28:08, 23.33s/it] + 16%|█▌ | 702/4476 [4:33:15<24:21:10, 23.23s/it] + 16%|█▌ | 703/4476 [4:33:38<24:25:59, 23.31s/it] + 16%|█▌ | 704/4476 [4:34:02<24:33:32, 23.44s/it] + 16%|█▌ | 705/4476 [4:34:25<24:35:59, 23.48s/it] + 16%|█▌ | 706/4476 [4:34:49<24:31:04, 23.41s/it] + 16%|█▌ | 707/4476 [4:35:12<24:24:56, 23.32s/it] + 16%|█▌ | 708/4476 [4:35:35<24:22:08, 23.28s/it] + 16%|█▌ | 709/4476 [4:35:58<24:22:38, 23.30s/it] + 16%|█▌ | 710/4476 [4:36:22<24:25:17, 23.35s/it] + +{'loss': 0.4238, 'learning_rate': 4.6959534233215116e-05, 'epoch': 0.48} + + 16%|█▌ | 710/4476 [4:36:22<24:25:17, 23.35s/it] + 16%|█▌ | 711/4476 [4:36:45<24:32:12, 23.46s/it] + 16%|█▌ | 712/4476 [4:37:09<24:32:08, 23.47s/it] + 16%|█▌ | 713/4476 [4:37:33<24:33:52, 23.50s/it] + 16%|█▌ | 714/4476 [4:37:56<24:31:30, 23.47s/it] + 16%|█▌ | 715/4476 [4:38:19<24:26:57, 23.40s/it] + 16%|█▌ | 716/4476 [4:38:43<24:27:39, 23.42s/it] + 16%|█▌ | 717/4476 [4:39:06<24:24:44, 23.38s/it] + 16%|█▌ | 718/4476 [4:39:29<24:21:04, 23.33s/it] + 16%|█▌ | 719/4476 [4:39:52<24:16:13, 23.26s/it] + 16%|█▌ | 720/4476 [4:40:16<24:19:56, 23.32s/it] + +{'loss': 0.4284, 'learning_rate': 4.687512690651328e-05, 'epoch': 0.48} + + 16%|█▌ | 720/4476 [4:40:16<24:19:56, 23.32s/it] + 16%|█▌ | 721/4476 [4:40:39<24:18:47, 23.31s/it] + 16%|█▌ | 722/4476 [4:41:02<24:18:08, 23.31s/it] + 16%|█▌ | 723/4476 [4:41:26<24:18:05, 23.31s/it] + 16%|█▌ | 724/4476 [4:41:49<24:18:23, 23.32s/it] + 16%|█▌ | 725/4476 [4:42:12<24:17:28, 23.31s/it] + 16%|█▌ | 726/4476 [4:42:36<24:17:36, 23.32s/it] + 16%|█▌ 
| 727/4476 [4:42:59<24:10:37, 23.22s/it] + 16%|█▋ | 728/4476 [4:43:22<24:10:57, 23.23s/it] + 16%|█▋ | 729/4476 [4:43:45<24:10:16, 23.22s/it] + 16%|█▋ | 730/4476 [4:44:09<24:15:21, 23.31s/it] + +{'loss': 0.4193, 'learning_rate': 4.678964195300028e-05, 'epoch': 0.49} + + 16%|█▋ | 730/4476 [4:44:09<24:15:21, 23.31s/it] + 16%|█▋ | 731/4476 [4:44:32<24:09:53, 23.23s/it] + 16%|█▋ | 732/4476 [4:44:55<24:17:00, 23.35s/it] + 16%|█▋ | 733/4476 [4:45:18<24:12:03, 23.28s/it] + 16%|█▋ | 734/4476 [4:45:42<24:13:12, 23.30s/it] + 16%|█▋ | 735/4476 [4:46:05<24:12:50, 23.30s/it] + 16%|█▋ | 736/4476 [4:46:28<24:11:18, 23.28s/it] + 16%|█▋ | 737/4476 [4:46:51<24:10:36, 23.28s/it] + 16%|█▋ | 738/4476 [4:47:15<24:14:52, 23.35s/it] + 17%|█▋ | 739/4476 [4:47:38<24:10:31, 23.29s/it] + 17%|█▋ | 740/4476 [4:48:01<24:08:33, 23.26s/it] + +{'loss': 0.4256, 'learning_rate': 4.670308358389184e-05, 'epoch': 0.5} + + 17%|█▋ | 740/4476 [4:48:01<24:08:33, 23.26s/it] + 17%|█▋ | 741/4476 [4:48:24<24:02:12, 23.17s/it] + 17%|█▋ | 742/4476 [4:48:48<24:05:21, 23.22s/it] + 17%|█▋ | 743/4476 [4:49:11<24:03:50, 23.21s/it] + 17%|█▋ | 744/4476 [4:49:34<24:08:37, 23.29s/it] + 17%|█▋ | 745/4476 [4:49:57<24:03:37, 23.22s/it] + 17%|█▋ | 746/4476 [4:50:21<24:07:04, 23.28s/it] + 17%|█▋ | 747/4476 [4:50:44<24:06:07, 23.27s/it] + 17%|█▋ | 748/4476 [4:51:07<24:02:26, 23.22s/it] + 17%|█▋ | 749/4476 [4:51:30<23:56:58, 23.13s/it] + 17%|█▋ | 750/4476 [4:51:53<23:57:39, 23.15s/it] + +{'loss': 0.4288, 'learning_rate': 4.6615456063282944e-05, 'epoch': 0.5} + + 17%|█▋ | 750/4476 [4:51:53<23:57:39, 23.15s/it] + 17%|█▋ | 751/4476 [4:52:16<23:56:50, 23.14s/it] + 17%|█▋ | 752/4476 [4:52:40<23:58:16, 23.17s/it] + 17%|█▋ | 753/4476 [4:53:03<23:59:33, 23.20s/it] + 17%|█▋ | 754/4476 [4:53:26<24:02:12, 23.25s/it] + 17%|█▋ | 755/4476 [4:53:49<24:01:14, 23.24s/it] + 17%|█▋ | 756/4476 [4:54:13<24:10:30, 23.40s/it] + 17%|█▋ | 757/4476 [4:54:37<24:15:06, 23.48s/it] + 17%|█▋ | 758/4476 [4:55:01<24:19:44, 23.56s/it] + 17%|█▋ | 759/4476 [4:55:24<24:15:07, 23.49s/it] + 17%|█▋ | 760/4476 [4:55:47<24:08:20, 23.39s/it] + +{'loss': 0.4335, 'learning_rate': 4.652676370793784e-05, 'epoch': 0.51} + + 17%|█▋ | 760/4476 [4:55:47<24:08:20, 23.39s/it] + 17%|█▋ | 761/4476 [4:56:11<24:11:45, 23.45s/it] + 17%|█▋ | 762/4476 [4:56:34<24:01:55, 23.29s/it] + 17%|█▋ | 763/4476 [4:56:57<24:10:19, 23.44s/it] + 17%|█▋ | 764/4476 [4:57:21<24:11:20, 23.46s/it] + 17%|█▋ | 765/4476 [4:57:44<24:12:40, 23.49s/it] + 17%|█▋ | 766/4476 [4:58:08<24:17:29, 23.57s/it] + 17%|█▋ | 767/4476 [4:58:32<24:17:35, 23.58s/it] + 17%|█▋ | 768/4476 [4:58:56<24:21:25, 23.65s/it] + 17%|█▋ | 769/4476 [4:59:19<24:18:46, 23.61s/it] + 17%|█▋ | 770/4476 [4:59:42<24:09:59, 23.48s/it] + +{'loss': 0.4271, 'learning_rate': 4.643701088707736e-05, 'epoch': 0.52} + + 17%|█▋ | 770/4476 [4:59:42<24:09:59, 23.48s/it] + 17%|█▋ | 771/4476 [5:00:06<24:09:55, 23.48s/it] + 17%|█▋ | 772/4476 [5:00:29<24:10:09, 23.49s/it] + 17%|█▋ | 773/4476 [5:00:53<24:09:58, 23.49s/it] + 17%|█▋ | 774/4476 [5:01:16<24:08:28, 23.48s/it] + 17%|█▋ | 775/4476 [5:01:40<24:05:22, 23.43s/it] + 17%|█▋ | 776/4476 [5:02:03<24:05:09, 23.43s/it] + 17%|█▋ | 777/4476 [5:02:27<24:07:07, 23.47s/it] + 17%|█▋ | 778/4476 [5:02:50<24:01:08, 23.38s/it] + 17%|█▋ | 779/4476 [5:03:14<24:08:09, 23.50s/it] + 17%|█▋ | 780/4476 [5:03:37<24:02:39, 23.42s/it] + +{'loss': 0.4304, 'learning_rate': 4.634620202216366e-05, 'epoch': 0.52} + + 17%|█▋ | 780/4476 [5:03:37<24:02:39, 23.42s/it] + 17%|█▋ | 781/4476 [5:04:00<23:59:43, 23.38s/it] + 17%|█▋ | 782/4476 [5:04:23<23:58:39, 23.37s/it] + 
17%|█▋ | 783/4476 [5:04:47<23:58:37, 23.37s/it] + 18%|█▊ | 784/4476 [5:05:10<23:53:21, 23.29s/it] + 18%|█▊ | 785/4476 [5:05:33<23:53:27, 23.30s/it] + 18%|█▊ | 786/4476 [5:05:57<23:53:44, 23.31s/it] + 18%|█▊ | 787/4476 [5:06:20<23:57:55, 23.39s/it] + 18%|█▊ | 788/4476 [5:06:43<23:53:06, 23.32s/it] + 18%|█▊ | 789/4476 [5:07:07<23:51:29, 23.30s/it] + 18%|█▊ | 790/4476 [5:07:30<24:00:43, 23.45s/it] + +{'loss': 0.4249, 'learning_rate': 4.625434158668246e-05, 'epoch': 0.53} + + 18%|█▊ | 790/4476 [5:07:30<24:00:43, 23.45s/it] + 18%|█▊ | 791/4476 [5:07:54<23:58:44, 23.43s/it] + 18%|█▊ | 792/4476 [5:08:17<24:00:35, 23.46s/it] + 18%|█▊ | 793/4476 [5:08:40<23:54:43, 23.37s/it] + 18%|█▊ | 794/4476 [5:09:04<23:56:42, 23.41s/it] + 18%|█▊ | 795/4476 [5:09:27<23:51:06, 23.33s/it] + 18%|█▊ | 796/4476 [5:09:50<23:47:05, 23.27s/it] + 18%|█▊ | 797/4476 [5:10:13<23:46:27, 23.26s/it] + 18%|█▊ | 798/4476 [5:10:37<23:47:39, 23.29s/it] + 18%|█▊ | 799/4476 [5:11:00<23:51:58, 23.37s/it] + 18%|█▊ | 800/4476 [5:11:24<23:51:32, 23.37s/it] + +{'loss': 0.4322, 'learning_rate': 4.6161434105922616e-05, 'epoch': 0.54} + + 18%|█▊ | 800/4476 [5:11:24<23:51:32, 23.37s/it][INFO|trainer.py:2939] 2023-11-12 08:35:11,424 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-800 +[INFO|tokenization_utils_base.py:2437] 2023-11-12 08:35:11,456 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-800/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-11-12 08:35:11,456 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-800/special_tokens_map.json +[INFO|tokenization_utils_base.py:2496] 2023-11-12 08:35:11,456 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-800/added_tokens.json + + 18%|█▊ | 801/4476 [5:11:47<23:53:34, 23.41s/it] + 18%|█▊ | 802/4476 [5:12:11<23:55:52, 23.45s/it] + 18%|█▊ | 803/4476 [5:12:35<24:01:29, 23.55s/it] + 18%|█▊ | 804/4476 [5:12:58<23:59:45, 23.53s/it] + 18%|█▊ | 805/4476 [5:13:21<23:53:14, 23.43s/it] + 18%|█▊ | 806/4476 [5:13:45<23:54:56, 23.46s/it] + 18%|█▊ | 807/4476 [5:14:08<23:53:10, 23.44s/it] + 18%|█▊ | 808/4476 [5:14:31<23:45:59, 23.33s/it] + 18%|█▊ | 809/4476 [5:14:54<23:39:20, 23.22s/it] + 18%|█▊ | 810/4476 [5:15:17<23:33:04, 23.13s/it] + +{'loss': 0.4229, 'learning_rate': 4.6067484156753234e-05, 'epoch': 0.54} + + 18%|█▊ | 810/4476 [5:15:17<23:33:04, 23.13s/it] + 18%|█▊ | 811/4476 [5:15:40<23:36:43, 23.19s/it] + 18%|█▊ | 812/4476 [5:16:04<23:36:52, 23.20s/it] + 18%|█▊ | 813/4476 [5:16:27<23:38:49, 23.24s/it] + 18%|█▊ | 814/4476 [5:16:50<23:42:26, 23.31s/it] + 18%|█▊ | 815/4476 [5:17:14<23:44:09, 23.34s/it] + 18%|█▊ | 816/4476 [5:17:37<23:34:17, 23.19s/it] + 18%|█▊ | 817/4476 [5:18:00<23:33:52, 23.18s/it] + 18%|█▊ | 818/4476 [5:18:23<23:39:37, 23.29s/it] + 18%|█▊ | 819/4476 [5:18:46<23:35:11, 23.22s/it] + 18%|█▊ | 820/4476 [5:19:10<23:33:54, 23.20s/it] + +{'loss': 0.4252, 'learning_rate': 4.597249636739815e-05, 'epoch': 0.55} + + 18%|█▊ | 820/4476 [5:19:10<23:33:54, 23.20s/it] + 18%|█▊ | 821/4476 [5:19:33<23:34:02, 23.21s/it] + 18%|█▊ | 822/4476 [5:19:56<23:33:45, 23.21s/it] + 18%|█▊ | 823/4476 [5:20:20<23:43:44, 23.38s/it] + 18%|█▊ | 824/4476 [5:20:43<23:44:35, 23.41s/it] + 18%|█▊ | 825/4476 [5:21:07<23:43:06, 23.39s/it] + 18%|█▊ | 826/4476 [5:21:30<23:41:39, 23.37s/it] + 18%|█▊ | 827/4476 [5:21:53<23:35:20, 23.27s/it] + 18%|█▊ | 828/4476 [5:22:16<23:35:35, 23.28s/it] + 19%|█▊ | 829/4476 
[5:22:39<23:33:24, 23.25s/it] + 19%|█▊ | 830/4476 [5:23:03<23:40:40, 23.38s/it] + +{'loss': 0.413, 'learning_rate': 4.5876475417207974e-05, 'epoch': 0.56} + + 19%|█▊ | 830/4476 [5:23:03<23:40:40, 23.38s/it] + 19%|█▊ | 831/4476 [5:23:27<23:45:57, 23.47s/it] + 19%|█▊ | 832/4476 [5:23:50<23:41:22, 23.40s/it] + 19%|█▊ | 833/4476 [5:24:14<23:47:54, 23.52s/it] + 19%|█▊ | 834/4476 [5:24:37<23:44:21, 23.47s/it] + 19%|█▊ | 835/4476 [5:25:01<23:40:50, 23.41s/it] + 19%|█▊ | 836/4476 [5:25:24<23:43:16, 23.46s/it] + 19%|█▊ | 837/4476 [5:25:48<23:48:17, 23.55s/it] + 19%|█▊ | 838/4476 [5:26:11<23:42:51, 23.47s/it] + 19%|█▊ | 839/4476 [5:26:34<23:35:28, 23.35s/it] + 19%|█▉ | 840/4476 [5:26:58<23:42:29, 23.47s/it] + +{'loss': 0.4186, 'learning_rate': 4.577942603642959e-05, 'epoch': 0.56} + + 19%|█▉ | 840/4476 [5:26:58<23:42:29, 23.47s/it] + 19%|█▉ | 841/4476 [5:27:21<23:35:55, 23.37s/it] + 19%|█▉ | 842/4476 [5:27:44<23:33:57, 23.35s/it] + 19%|█▉ | 843/4476 [5:28:08<23:36:37, 23.40s/it] + 19%|█▉ | 844/4476 [5:28:31<23:37:16, 23.41s/it] + 19%|█▉ | 845/4476 [5:28:55<23:34:38, 23.38s/it] + 19%|█▉ | 846/4476 [5:29:18<23:32:59, 23.36s/it] + 19%|█▉ | 847/4476 [5:29:41<23:31:59, 23.35s/it] + 19%|█▉ | 848/4476 [5:30:05<23:31:33, 23.34s/it] + 19%|█▉ | 849/4476 [5:30:28<23:28:34, 23.30s/it] + 19%|█▉ | 850/4476 [5:30:51<23:32:42, 23.38s/it] + +{'loss': 0.4233, 'learning_rate': 4.568135300597306e-05, 'epoch': 0.57} + + 19%|█▉ | 850/4476 [5:30:51<23:32:42, 23.38s/it] + 19%|█▉ | 851/4476 [5:31:15<23:35:31, 23.43s/it] + 19%|█▉ | 852/4476 [5:31:38<23:32:34, 23.39s/it] + 19%|█▉ | 853/4476 [5:32:01<23:30:19, 23.36s/it] + 19%|█▉ | 854/4476 [5:32:24<23:22:14, 23.23s/it] + 19%|█▉ | 855/4476 [5:32:48<23:23:13, 23.25s/it] + 19%|█▉ | 856/4476 [5:33:11<23:32:17, 23.41s/it] + 19%|█▉ | 857/4476 [5:33:35<23:28:35, 23.35s/it] + 19%|█▉ | 858/4476 [5:33:58<23:26:12, 23.32s/it] + 19%|█▉ | 859/4476 [5:34:21<23:23:06, 23.28s/it] + 19%|█▉ | 860/4476 [5:34:45<23:27:43, 23.36s/it] + +{'loss': 0.4177, 'learning_rate': 4.5582261157176164e-05, 'epoch': 0.58} + + 19%|█▉ | 860/4476 [5:34:45<23:27:43, 23.36s/it] + 19%|█▉ | 861/4476 [5:35:08<23:27:13, 23.36s/it] + 19%|█▉ | 862/4476 [5:35:31<23:24:50, 23.32s/it] + 19%|█▉ | 863/4476 [5:35:55<23:32:59, 23.47s/it] + 19%|█▉ | 864/4476 [5:36:18<23:23:56, 23.32s/it] + 19%|█▉ | 865/4476 [5:36:41<23:25:21, 23.35s/it] + 19%|█▉ | 866/4476 [5:37:05<23:24:38, 23.35s/it] + 19%|█▉ | 867/4476 [5:37:29<23:30:57, 23.46s/it] + 19%|█▉ | 868/4476 [5:37:52<23:28:21, 23.42s/it] + 19%|█▉ | 869/4476 [5:38:15<23:26:30, 23.40s/it] + 19%|█▉ | 870/4476 [5:38:39<23:24:21, 23.37s/it] + +{'loss': 0.4236, 'learning_rate': 4.5482155371566384e-05, 'epoch': 0.58} + + 19%|█▉ | 870/4476 [5:38:39<23:24:21, 23.37s/it] + 19%|█▉ | 871/4476 [5:39:02<23:31:35, 23.49s/it] + 19%|█▉ | 872/4476 [5:39:26<23:26:41, 23.42s/it] + 20%|█▉ | 873/4476 [5:39:49<23:25:24, 23.40s/it] + 20%|█▉ | 874/4476 [5:40:12<23:22:19, 23.36s/it] + 20%|█▉ | 875/4476 [5:40:35<23:19:39, 23.32s/it] + 20%|█▉ | 876/4476 [5:40:59<23:24:34, 23.41s/it] + 20%|█▉ | 877/4476 [5:41:22<23:18:28, 23.31s/it] + 20%|█▉ | 878/4476 [5:41:45<23:17:29, 23.30s/it] + 20%|█▉ | 879/4476 [5:42:09<23:19:04, 23.34s/it] + 20%|█▉ | 880/4476 [5:42:32<23:21:57, 23.39s/it] + +{'loss': 0.4228, 'learning_rate': 4.538104058062042e-05, 'epoch': 0.59} + + 20%|█▉ | 880/4476 [5:42:32<23:21:57, 23.39s/it] + 20%|█▉ | 881/4476 [5:42:56<23:19:38, 23.36s/it] + 20%|█▉ | 882/4476 [5:43:19<23:23:57, 23.44s/it] + 20%|█▉ | 883/4476 [5:43:43<23:21:07, 23.40s/it] + 20%|█▉ | 884/4476 [5:44:06<23:24:08, 23.45s/it] + 20%|█▉ | 
885/4476 [5:44:30<23:23:44, 23.45s/it] + 20%|█▉ | 886/4476 [5:44:53<23:23:07, 23.45s/it] + 20%|█▉ | 887/4476 [5:45:16<23:15:57, 23.34s/it] + 20%|█▉ | 888/4476 [5:45:39<23:14:36, 23.32s/it] + 20%|█▉ | 889/4476 [5:46:02<23:09:12, 23.24s/it] + 20%|█▉ | 890/4476 [5:46:26<23:12:44, 23.30s/it] + +{'loss': 0.4181, 'learning_rate': 4.5278921765521234e-05, 'epoch': 0.6} + + 20%|█▉ | 890/4476 [5:46:26<23:12:44, 23.30s/it] + 20%|█▉ | 891/4476 [5:46:49<23:10:39, 23.27s/it] + 20%|█▉ | 892/4476 [5:47:12<23:11:36, 23.30s/it] + 20%|█▉ | 893/4476 [5:47:36<23:13:12, 23.33s/it] + 20%|█▉ | 894/4476 [5:47:59<23:09:59, 23.28s/it] + 20%|█▉ | 895/4476 [5:48:22<23:07:06, 23.24s/it] + 20%|██ | 896/4476 [5:48:46<23:09:09, 23.28s/it] + 20%|██ | 897/4476 [5:49:09<23:05:21, 23.22s/it] + 20%|██ | 898/4476 [5:49:32<23:07:07, 23.26s/it] + 20%|██ | 899/4476 [5:49:55<23:02:46, 23.19s/it] + 20%|██ | 900/4476 [5:50:18<23:03:32, 23.21s/it] + +{'loss': 0.4261, 'learning_rate': 4.51758039569127e-05, 'epoch': 0.6} + + 20%|██ | 900/4476 [5:50:18<23:03:32, 23.21s/it] + 20%|██ | 901/4476 [5:50:41<23:01:04, 23.18s/it] + 20%|██ | 902/4476 [5:51:05<23:01:26, 23.19s/it] + 20%|██ | 903/4476 [5:51:28<23:07:03, 23.29s/it] + 20%|██ | 904/4476 [5:51:52<23:09:25, 23.34s/it] + 20%|██ | 905/4476 [5:52:15<23:11:06, 23.37s/it] + 20%|██ | 906/4476 [5:52:38<23:08:54, 23.34s/it] + 20%|██ | 907/4476 [5:53:01<23:06:10, 23.30s/it] + 20%|██ | 908/4476 [5:53:25<23:12:41, 23.42s/it] + 20%|██ | 909/4476 [5:53:48<23:05:21, 23.30s/it] + 20%|██ | 910/4476 [5:54:12<23:07:57, 23.35s/it] + +{'loss': 0.4217, 'learning_rate': 4.5071692234651764e-05, 'epoch': 0.61} + + 20%|██ | 910/4476 [5:54:12<23:07:57, 23.35s/it] + 20%|██ | 911/4476 [5:54:35<23:06:28, 23.33s/it] + 20%|██ | 912/4476 [5:54:58<23:03:57, 23.30s/it] + 20%|██ | 913/4476 [5:55:21<23:00:57, 23.26s/it] + 20%|██ | 914/4476 [5:55:45<23:02:37, 23.29s/it] + 20%|██ | 915/4476 [5:56:08<23:10:34, 23.43s/it] + 20%|██ | 916/4476 [5:56:32<23:08:23, 23.40s/it] + 20%|██ | 917/4476 [5:56:55<23:10:01, 23.43s/it] + 21%|██ | 918/4476 [5:57:19<23:12:23, 23.48s/it] + 21%|██ | 919/4476 [5:57:43<23:16:15, 23.55s/it] + 21%|██ | 920/4476 [5:58:06<23:09:42, 23.45s/it] + +{'loss': 0.4191, 'learning_rate': 4.4966591727558184e-05, 'epoch': 0.62} + + 21%|██ | 920/4476 [5:58:06<23:09:42, 23.45s/it] + 21%|██ | 921/4476 [5:58:30<23:15:18, 23.55s/it] + 21%|██ | 922/4476 [5:58:53<23:10:19, 23.47s/it] + 21%|██ | 923/4476 [5:59:17<23:16:07, 23.58s/it] + 21%|██ | 924/4476 [5:59:40<23:05:30, 23.40s/it] + 21%|██ | 925/4476 [6:00:03<23:01:37, 23.34s/it] + 21%|██ | 926/4476 [6:00:27<23:09:42, 23.49s/it] + 21%|██ | 927/4476 [6:00:50<23:04:26, 23.41s/it] + 21%|██ | 928/4476 [6:01:13<22:56:42, 23.28s/it] + 21%|██ | 929/4476 [6:01:37<23:01:34, 23.37s/it] + 21%|██ | 930/4476 [6:02:00<23:04:20, 23.42s/it] + +{'loss': 0.4247, 'learning_rate': 4.48605076131619e-05, 'epoch': 0.62} + + 21%|██ | 930/4476 [6:02:00<23:04:20, 23.42s/it] + 21%|██ | 931/4476 [6:02:23<23:01:03, 23.37s/it] + 21%|██ | 932/4476 [6:02:46<22:55:40, 23.29s/it] + 21%|██ | 933/4476 [6:03:10<22:53:02, 23.25s/it] + 21%|██ | 934/4476 [6:03:33<22:53:37, 23.27s/it] + 21%|██ | 935/4476 [6:03:57<22:59:34, 23.38s/it] + 21%|██ | 936/4476 [6:04:20<22:59:10, 23.38s/it] + 21%|██ | 937/4476 [6:04:44<23:04:44, 23.48s/it] + 21%|██ | 938/4476 [6:05:07<22:59:08, 23.39s/it] + 21%|██ | 939/4476 [6:05:30<22:57:46, 23.37s/it] + 21%|██ | 940/4476 [6:05:53<22:55:43, 23.34s/it] + +{'loss': 0.4236, 'learning_rate': 4.475344511744794e-05, 'epoch': 0.63} + + 21%|██ | 940/4476 [6:05:53<22:55:43, 23.34s/it] + 
21%|██ | 941/4476 [6:06:17<22:59:22, 23.41s/it] + 21%|██ | 942/4476 [6:06:41<23:02:43, 23.48s/it] + 21%|██ | 943/4476 [6:07:04<23:00:54, 23.45s/it] + 21%|██ | 944/4476 [6:07:27<23:00:34, 23.45s/it] + 21%|██ | 945/4476 [6:07:50<22:50:47, 23.29s/it] + 21%|██ | 946/4476 [6:08:14<22:55:36, 23.38s/it] + 21%|██ | 947/4476 [6:08:37<22:54:55, 23.38s/it] + 21%|██ | 948/4476 [6:09:01<22:55:35, 23.39s/it] + 21%|██ | 949/4476 [6:09:24<22:50:36, 23.32s/it] + 21%|██ | 950/4476 [6:09:47<22:49:41, 23.31s/it] + +{'loss': 0.4172, 'learning_rate': 4.464540951459902e-05, 'epoch': 0.64} + + 21%|██ | 950/4476 [6:09:47<22:49:41, 23.31s/it] + 21%|██ | 951/4476 [6:10:11<22:49:56, 23.32s/it] + 21%|██▏ | 952/4476 [6:10:34<22:58:44, 23.47s/it] + 21%|██▏ | 953/4476 [6:10:58<22:59:25, 23.49s/it] + 21%|██▏ | 954/4476 [6:11:22<23:03:03, 23.56s/it] + 21%|██▏ | 955/4476 [6:11:45<22:53:55, 23.41s/it] + 21%|██▏ | 956/4476 [6:12:08<22:51:48, 23.38s/it] + 21%|██▏ | 957/4476 [6:12:31<22:49:47, 23.36s/it] + 21%|██▏ | 958/4476 [6:12:55<22:54:24, 23.44s/it] + 21%|██▏ | 959/4476 [6:13:18<22:55:26, 23.46s/it] + 21%|██▏ | 960/4476 [6:13:42<22:52:09, 23.42s/it] + +{'loss': 0.4209, 'learning_rate': 4.4536406126735664e-05, 'epoch': 0.64} + + 21%|██▏ | 960/4476 [6:13:42<22:52:09, 23.42s/it] + 21%|██▏ | 961/4476 [6:14:05<22:45:26, 23.31s/it] + 21%|██▏ | 962/4476 [6:14:28<22:48:54, 23.37s/it] + 22%|██▏ | 963/4476 [6:14:52<22:51:17, 23.42s/it] + 22%|██▏ | 964/4476 [6:15:15<22:50:12, 23.41s/it] + 22%|██▏ | 965/4476 [6:15:38<22:45:46, 23.34s/it] + 22%|██▏ | 966/4476 [6:16:02<22:44:52, 23.33s/it] + 22%|██▏ | 967/4476 [6:16:25<22:44:05, 23.32s/it] + 22%|██▏ | 968/4476 [6:16:48<22:43:10, 23.32s/it] + 22%|██▏ | 969/4476 [6:17:12<22:45:40, 23.36s/it] + 22%|██▏ | 970/4476 [6:17:36<22:50:52, 23.46s/it] + +{'loss': 0.4179, 'learning_rate': 4.442644032365407e-05, 'epoch': 0.65} + + 22%|██▏ | 970/4476 [6:17:36<22:50:52, 23.46s/it] + 22%|██▏ | 971/4476 [6:17:59<22:50:55, 23.47s/it] + 22%|██▏ | 972/4476 [6:18:22<22:48:42, 23.44s/it] + 22%|██▏ | 973/4476 [6:18:46<22:49:03, 23.45s/it] + 22%|██▏ | 974/4476 [6:19:09<22:49:07, 23.46s/it] + 22%|██▏ | 975/4476 [6:19:33<22:49:15, 23.47s/it] + 22%|██▏ | 976/4476 [6:19:57<22:52:59, 23.54s/it] + 22%|██▏ | 977/4476 [6:20:20<22:52:54, 23.54s/it] + 22%|██▏ | 978/4476 [6:20:44<22:57:17, 23.62s/it] + 22%|██▏ | 979/4476 [6:21:07<22:51:45, 23.54s/it] + 22%|██▏ | 980/4476 [6:21:30<22:44:14, 23.41s/it] + +{'loss': 0.4166, 'learning_rate': 4.431551752256155e-05, 'epoch': 0.66} + + 22%|██▏ | 980/4476 [6:21:30<22:44:14, 23.41s/it] + 22%|██▏ | 981/4476 [6:21:54<22:48:58, 23.50s/it] + 22%|██▏ | 982/4476 [6:22:18<22:47:40, 23.49s/it] + 22%|██▏ | 983/4476 [6:22:41<22:48:43, 23.51s/it] + 22%|██▏ | 984/4476 [6:23:04<22:45:09, 23.46s/it] + 22%|██▏ | 985/4476 [6:23:28<22:43:48, 23.44s/it] + 22%|██▏ | 986/4476 [6:23:51<22:43:51, 23.45s/it] + 22%|██▏ | 987/4476 [6:24:15<22:42:47, 23.44s/it] + 22%|██▏ | 988/4476 [6:24:38<22:44:02, 23.46s/it] + 22%|██▏ | 989/4476 [6:25:02<22:48:57, 23.56s/it] + 22%|██▏ | 990/4476 [6:25:25<22:42:46, 23.46s/it] + +{'loss': 0.4173, 'learning_rate': 4.420364318780973e-05, 'epoch': 0.66} + + 22%|██▏ | 990/4476 [6:25:25<22:42:46, 23.46s/it] + 22%|██▏ | 991/4476 [6:25:49<22:46:02, 23.52s/it] + 22%|██▏ | 992/4476 [6:26:12<22:45:02, 23.51s/it] + 22%|██▏ | 993/4476 [6:26:36<22:44:45, 23.51s/it] + 22%|██▏ | 994/4476 [6:26:59<22:44:35, 23.51s/it] + 22%|██▏ | 995/4476 [6:27:22<22:35:31, 23.36s/it] + 22%|██▏ | 996/4476 [6:27:46<22:36:31, 23.39s/it] + 22%|██▏ | 997/4476 [6:28:09<22:39:27, 23.45s/it] + 22%|██▏ | 998/4476 
[6:28:33<22:38:45, 23.44s/it] + 22%|██▏ | 999/4476 [6:28:56<22:37:21, 23.42s/it] + 22%|██▏ | 1000/4476 [6:29:20<22:36:52, 23.42s/it] + +{'loss': 0.4166, 'learning_rate': 4.4090822830625236e-05, 'epoch': 0.67} + + 22%|██▏ | 1000/4476 [6:29:20<22:36:52, 23.42s/it][INFO|trainer.py:2939] 2023-11-12 09:53:07,385 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1000 +[INFO|tokenization_utils_base.py:2437] 2023-11-12 09:53:07,416 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1000/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-11-12 09:53:07,416 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1000/special_tokens_map.json +[INFO|tokenization_utils_base.py:2496] 2023-11-12 09:53:07,416 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1000/added_tokens.json + + 22%|██▏ | 1001/4476 [6:29:43<22:35:09, 23.40s/it] + 22%|██▏ | 1002/4476 [6:30:06<22:35:47, 23.42s/it] + 22%|██▏ | 1003/4476 [6:30:30<22:37:07, 23.45s/it] + 22%|██▏ | 1004/4476 [6:30:53<22:33:39, 23.39s/it] + 22%|██▏ | 1005/4476 [6:31:17<22:36:11, 23.44s/it] + 22%|██▏ | 1006/4476 [6:31:40<22:39:14, 23.50s/it] + 22%|██▏ | 1007/4476 [6:32:04<22:32:29, 23.39s/it] + 23%|██▎ | 1008/4476 [6:32:27<22:28:13, 23.33s/it] + 23%|██▎ | 1009/4476 [6:32:50<22:28:03, 23.33s/it] + 23%|██▎ | 1010/4476 [6:33:14<22:30:45, 23.38s/it] + +{'loss': 0.4173, 'learning_rate': 4.3977062008838307e-05, 'epoch': 0.68} + + 23%|██▎ | 1010/4476 [6:33:14<22:30:45, 23.38s/it] + 23%|██▎ | 1011/4476 [6:33:37<22:25:34, 23.30s/it] + 23%|██▎ | 1012/4476 [6:34:00<22:20:06, 23.21s/it] + 23%|██▎ | 1013/4476 [6:34:23<22:21:46, 23.25s/it] + 23%|██▎ | 1014/4476 [6:34:46<22:20:05, 23.23s/it] + 23%|██▎ | 1015/4476 [6:35:09<22:19:05, 23.21s/it] + 23%|██▎ | 1016/4476 [6:35:33<22:19:46, 23.23s/it] + 23%|██▎ | 1017/4476 [6:35:56<22:21:19, 23.27s/it] + 23%|██▎ | 1018/4476 [6:36:19<22:22:31, 23.29s/it] + 23%|██▎ | 1019/4476 [6:36:43<22:21:08, 23.28s/it] + 23%|██▎ | 1020/4476 [6:37:06<22:24:22, 23.34s/it] + +{'loss': 0.4049, 'learning_rate': 4.3862366326608975e-05, 'epoch': 0.68} + + 23%|██▎ | 1020/4476 [6:37:06<22:24:22, 23.34s/it] + 23%|██▎ | 1021/4476 [6:37:29<22:23:04, 23.32s/it] + 23%|██▎ | 1022/4476 [6:37:53<22:25:55, 23.38s/it] + 23%|██▎ | 1023/4476 [6:38:16<22:20:01, 23.28s/it] + 23%|██▎ | 1024/4476 [6:38:39<22:21:00, 23.31s/it] + 23%|██▎ | 1025/4476 [6:39:03<22:19:00, 23.28s/it] + 23%|██▎ | 1026/4476 [6:39:26<22:17:58, 23.27s/it] + 23%|██▎ | 1027/4476 [6:39:49<22:23:18, 23.37s/it] + 23%|██▎ | 1028/4476 [6:40:13<22:26:16, 23.43s/it] + 23%|██▎ | 1029/4476 [6:40:36<22:24:15, 23.40s/it] + 23%|██▎ | 1030/4476 [6:41:00<22:25:08, 23.42s/it] + +{'loss': 0.4143, 'learning_rate': 4.374674143415096e-05, 'epoch': 0.69} + + 23%|██▎ | 1030/4476 [6:41:00<22:25:08, 23.42s/it] + 23%|██▎ | 1031/4476 [6:41:23<22:18:14, 23.31s/it] + 23%|██▎ | 1032/4476 [6:41:46<22:16:07, 23.28s/it] + 23%|██▎ | 1033/4476 [6:42:09<22:14:31, 23.26s/it] + 23%|██▎ | 1034/4476 [6:42:33<22:17:26, 23.31s/it] + 23%|██▎ | 1035/4476 [6:42:56<22:21:24, 23.39s/it] + 23%|██▎ | 1036/4476 [6:43:19<22:13:19, 23.26s/it] + 23%|██▎ | 1037/4476 [6:43:42<22:12:16, 23.24s/it] + 23%|██▎ | 1038/4476 [6:44:05<22:09:18, 23.20s/it] + 23%|██▎ | 1039/4476 [6:44:29<22:13:14, 23.27s/it] + 23%|██▎ | 1040/4476 [6:44:52<22:14:52, 23.31s/it] + +{'loss': 0.4219, 'learning_rate': 4.363019302745334e-05, 'epoch': 0.7} + + 23%|██▎ | 
1040/4476 [6:44:52<22:14:52, 23.31s/it] + 23%|██▎ | 1041/4476 [6:45:16<22:14:08, 23.30s/it] + 23%|██▎ | 1042/4476 [6:45:39<22:15:59, 23.34s/it] + 23%|██▎ | 1043/4476 [6:46:03<22:17:45, 23.38s/it] + 23%|██▎ | 1044/4476 [6:46:26<22:20:02, 23.43s/it] + 23%|██▎ | 1045/4476 [6:46:50<22:21:06, 23.45s/it] + 23%|██▎ | 1046/4476 [6:47:13<22:18:30, 23.41s/it] + 23%|██▎ | 1047/4476 [6:47:36<22:17:03, 23.40s/it] + 23%|██▎ | 1048/4476 [6:48:00<22:15:10, 23.37s/it] + 23%|██▎ | 1049/4476 [6:48:23<22:14:08, 23.36s/it] + 23%|██▎ | 1050/4476 [6:48:47<22:21:26, 23.49s/it] + +{'loss': 0.4152, 'learning_rate': 4.3512726847999987e-05, 'epoch': 0.7} + + 23%|██▎ | 1050/4476 [6:48:47<22:21:26, 23.49s/it] + 23%|██▎ | 1051/4476 [6:49:10<22:16:36, 23.42s/it] + 24%|██▎ | 1052/4476 [6:49:33<22:08:53, 23.29s/it] + 24%|██▎ | 1053/4476 [6:49:56<22:08:20, 23.28s/it] + 24%|██▎ | 1054/4476 [6:50:19<22:08:04, 23.29s/it] + 24%|██▎ | 1055/4476 [6:50:43<22:12:57, 23.38s/it] + 24%|██▎ | 1056/4476 [6:51:07<22:15:49, 23.44s/it] + 24%|██▎ | 1057/4476 [6:51:30<22:16:53, 23.46s/it] + 24%|██▎ | 1058/4476 [6:51:53<22:13:27, 23.41s/it] + 24%|██▎ | 1059/4476 [6:52:17<22:09:19, 23.34s/it] + 24%|██▎ | 1060/4476 [6:52:40<22:06:53, 23.31s/it] + +{'loss': 0.4153, 'learning_rate': 4.339434868248665e-05, 'epoch': 0.71} + + 24%|██▎ | 1060/4476 [6:52:40<22:06:53, 23.31s/it] + 24%|██▎ | 1061/4476 [6:53:03<22:07:10, 23.32s/it] + 24%|██▎ | 1062/4476 [6:53:27<22:12:46, 23.42s/it] + 24%|██▎ | 1063/4476 [6:53:50<22:06:59, 23.33s/it] + 24%|██▍ | 1064/4476 [6:54:13<22:09:36, 23.38s/it] + 24%|██▍ | 1065/4476 [6:54:37<22:11:22, 23.42s/it] + 24%|██▍ | 1066/4476 [6:55:00<22:02:18, 23.27s/it] + 24%|██▍ | 1067/4476 [6:55:23<22:03:35, 23.30s/it] + 24%|██▍ | 1068/4476 [6:55:46<21:58:05, 23.21s/it] + 24%|██▍ | 1069/4476 [6:56:10<21:59:23, 23.24s/it] + 24%|██▍ | 1070/4476 [6:56:33<21:59:02, 23.24s/it] + +{'loss': 0.4148, 'learning_rate': 4.3275064362535966e-05, 'epoch': 0.72} + + 24%|██▍ | 1070/4476 [6:56:33<21:59:02, 23.24s/it] + 24%|██▍ | 1071/4476 [6:56:56<22:00:52, 23.28s/it] + 24%|██▍ | 1072/4476 [6:57:19<21:54:27, 23.17s/it] + 24%|██▍ | 1073/4476 [6:57:42<21:52:27, 23.14s/it] + 24%|██▍ | 1074/4476 [6:58:05<21:55:00, 23.19s/it] + 24%|██▍ | 1075/4476 [6:58:29<21:55:45, 23.21s/it] + 24%|██▍ | 1076/4476 [6:58:52<21:52:58, 23.17s/it] + 24%|██▍ | 1077/4476 [6:59:15<21:53:37, 23.19s/it] + 24%|██▍ | 1078/4476 [6:59:38<21:57:13, 23.26s/it] + 24%|██▍ | 1079/4476 [7:00:01<21:52:14, 23.18s/it] + 24%|██▍ | 1080/4476 [7:00:25<21:54:46, 23.23s/it] + +{'loss': 0.4147, 'learning_rate': 4.315487976441014e-05, 'epoch': 0.72} + + 24%|██▍ | 1080/4476 [7:00:25<21:54:46, 23.23s/it] + 24%|██▍ | 1081/4476 [7:00:48<21:59:35, 23.32s/it] + 24%|██▍ | 1082/4476 [7:01:12<21:59:21, 23.32s/it] + 24%|██▍ | 1083/4476 [7:01:35<22:02:01, 23.38s/it] + 24%|██▍ | 1084/4476 [7:01:58<21:59:08, 23.33s/it] + 24%|██▍ | 1085/4476 [7:02:22<22:05:48, 23.46s/it] + 24%|██▍ | 1086/4476 [7:02:46<22:04:44, 23.45s/it] + 24%|██▍ | 1087/4476 [7:03:09<22:08:06, 23.51s/it] + 24%|██▍ | 1088/4476 [7:03:33<22:03:59, 23.45s/it] + 24%|██▍ | 1089/4476 [7:03:56<22:01:03, 23.40s/it] + 24%|██▍ | 1090/4476 [7:04:20<22:06:17, 23.50s/it] + +{'loss': 0.41, 'learning_rate': 4.303380080872145e-05, 'epoch': 0.73} + + 24%|██▍ | 1090/4476 [7:04:20<22:06:17, 23.50s/it] + 24%|██▍ | 1091/4476 [7:04:43<22:01:44, 23.43s/it] + 24%|██▍ | 1092/4476 [7:05:06<21:59:36, 23.40s/it] + 24%|██▍ | 1093/4476 [7:05:29<21:55:51, 23.34s/it] + 24%|██▍ | 1094/4476 [7:05:53<21:55:43, 23.34s/it] + 24%|██▍ | 1095/4476 [7:06:16<21:50:23, 23.25s/it] + 24%|██▍ 
| 1096/4476 [7:06:39<21:57:00, 23.38s/it] + 25%|██▍ | 1097/4476 [7:07:03<22:03:38, 23.50s/it] + 25%|██▍ | 1098/4476 [7:07:26<21:54:45, 23.35s/it] + 25%|██▍ | 1099/4476 [7:07:49<21:51:39, 23.30s/it] + 25%|██▍ | 1100/4476 [7:08:13<21:54:30, 23.36s/it] + +{'loss': 0.4119, 'learning_rate': 4.291183346014063e-05, 'epoch': 0.74} + + 25%|██▍ | 1100/4476 [7:08:13<21:54:30, 23.36s/it] + 25%|██▍ | 1101/4476 [7:08:36<21:51:56, 23.32s/it] + 25%|██▍ | 1102/4476 [7:08:59<21:50:11, 23.30s/it] + 25%|██▍ | 1103/4476 [7:09:23<21:50:39, 23.31s/it] + 25%|██▍ | 1104/4476 [7:09:46<21:49:55, 23.31s/it] + 25%|██▍ | 1105/4476 [7:10:10<21:57:27, 23.45s/it] + 25%|██▍ | 1106/4476 [7:10:34<22:02:36, 23.55s/it] + 25%|██▍ | 1107/4476 [7:10:57<22:01:20, 23.53s/it] + 25%|██▍ | 1108/4476 [7:11:21<22:04:21, 23.59s/it] + 25%|██▍ | 1109/4476 [7:11:44<21:59:59, 23.52s/it] + 25%|██▍ | 1110/4476 [7:12:08<21:58:19, 23.50s/it] + +{'loss': 0.4173, 'learning_rate': 4.278898372710296e-05, 'epoch': 0.74} + + 25%|██▍ | 1110/4476 [7:12:08<21:58:19, 23.50s/it] + 25%|██▍ | 1111/4476 [7:12:31<21:55:29, 23.46s/it] + 25%|██▍ | 1112/4476 [7:12:54<21:55:15, 23.46s/it] + 25%|██▍ | 1113/4476 [7:13:18<21:52:27, 23.42s/it] + 25%|██▍ | 1114/4476 [7:13:41<21:47:48, 23.34s/it] + 25%|██▍ | 1115/4476 [7:14:04<21:49:43, 23.38s/it] + 25%|██▍ | 1116/4476 [7:14:28<21:51:55, 23.43s/it] + 25%|██▍ | 1117/4476 [7:14:51<21:49:12, 23.39s/it] + 25%|██▍ | 1118/4476 [7:15:15<21:51:35, 23.44s/it] + 25%|██▌ | 1119/4476 [7:15:38<21:50:06, 23.42s/it] + 25%|██▌ | 1120/4476 [7:16:01<21:47:52, 23.38s/it] + +{'loss': 0.4119, 'learning_rate': 4.266525766151238e-05, 'epoch': 0.75} + + 25%|██▌ | 1120/4476 [7:16:01<21:47:52, 23.38s/it] + 25%|██▌ | 1121/4476 [7:16:25<21:43:34, 23.31s/it] + 25%|██▌ | 1122/4476 [7:16:48<21:44:03, 23.33s/it] + 25%|██▌ | 1123/4476 [7:17:11<21:43:57, 23.33s/it] + 25%|██▌ | 1124/4476 [7:17:35<21:42:05, 23.31s/it] + 25%|██▌ | 1125/4476 [7:17:58<21:37:31, 23.23s/it] + 25%|██▌ | 1126/4476 [7:18:21<21:43:52, 23.35s/it] + 25%|██▌ | 1127/4476 [7:18:44<21:40:57, 23.31s/it] + 25%|██▌ | 1128/4476 [7:19:08<21:40:04, 23.30s/it] + 25%|██▌ | 1129/4476 [7:19:31<21:39:29, 23.30s/it] + 25%|██▌ | 1130/4476 [7:19:54<21:41:39, 23.34s/it] + +{'loss': 0.4163, 'learning_rate': 4.254066135844326e-05, 'epoch': 0.76} + + 25%|██▌ | 1130/4476 [7:19:54<21:41:39, 23.34s/it] + 25%|██▌ | 1131/4476 [7:20:18<21:37:47, 23.28s/it] + 25%|██▌ | 1132/4476 [7:20:41<21:34:41, 23.23s/it] + 25%|██▌ | 1133/4476 [7:21:04<21:36:15, 23.27s/it] + 25%|██▌ | 1134/4476 [7:21:27<21:36:26, 23.28s/it] + 25%|██▌ | 1135/4476 [7:21:51<21:37:03, 23.29s/it] + 25%|██▌ | 1136/4476 [7:22:14<21:40:19, 23.36s/it] + 25%|██▌ | 1137/4476 [7:22:38<21:41:23, 23.39s/it] + 25%|██▌ | 1138/4476 [7:23:01<21:35:44, 23.29s/it] + 25%|██▌ | 1139/4476 [7:23:24<21:34:47, 23.28s/it] + 25%|██▌ | 1140/4476 [7:23:47<21:38:19, 23.35s/it] + +{'loss': 0.4104, 'learning_rate': 4.2415200955840184e-05, 'epoch': 0.76} + + 25%|██▌ | 1140/4476 [7:23:47<21:38:19, 23.35s/it] + 25%|██▌ | 1141/4476 [7:24:11<21:39:44, 23.38s/it] + 26%|██▌ | 1142/4476 [7:24:34<21:33:29, 23.28s/it] + 26%|██▌ | 1143/4476 [7:24:57<21:30:10, 23.23s/it] + 26%|██▌ | 1144/4476 [7:25:20<21:29:46, 23.23s/it] + 26%|██▌ | 1145/4476 [7:25:43<21:28:58, 23.22s/it] + 26%|██▌ | 1146/4476 [7:26:07<21:28:52, 23.22s/it] + 26%|██▌ | 1147/4476 [7:26:30<21:35:59, 23.36s/it] + 26%|██▌ | 1148/4476 [7:26:54<21:35:46, 23.36s/it] + 26%|██▌ | 1149/4476 [7:27:17<21:35:01, 23.35s/it] + 26%|██▌ | 1150/4476 [7:27:40<21:34:44, 23.36s/it] + +{'loss': 0.4045, 'learning_rate': 
4.228888263421557e-05, 'epoch': 0.77} + + 26%|██▌ | 1150/4476 [7:27:40<21:34:44, 23.36s/it] + 26%|██▌ | 1151/4476 [7:28:04<21:32:39, 23.33s/it] + 26%|██▌ | 1152/4476 [7:28:27<21:28:54, 23.27s/it] + 26%|██▌ | 1153/4476 [7:28:50<21:26:44, 23.23s/it] + 26%|██▌ | 1154/4476 [7:29:13<21:29:12, 23.28s/it] + 26%|██▌ | 1155/4476 [7:29:37<21:36:33, 23.42s/it] + 26%|██▌ | 1156/4476 [7:30:00<21:25:05, 23.22s/it] + 26%|██▌ | 1157/4476 [7:30:23<21:28:52, 23.30s/it] + 26%|██▌ | 1158/4476 [7:30:47<21:31:27, 23.35s/it] + 26%|██▌ | 1159/4476 [7:31:10<21:27:26, 23.29s/it] + 26%|██▌ | 1160/4476 [7:31:33<21:27:47, 23.30s/it] + +{'loss': 0.413, 'learning_rate': 4.216171261634521e-05, 'epoch': 0.78} + + 26%|██▌ | 1160/4476 [7:31:33<21:27:47, 23.30s/it] + 26%|██▌ | 1161/4476 [7:31:57<21:27:09, 23.30s/it] + 26%|██▌ | 1162/4476 [7:32:20<21:30:33, 23.37s/it] + 26%|██▌ | 1163/4476 [7:32:43<21:26:44, 23.30s/it] + 26%|██▌ | 1164/4476 [7:33:07<21:26:53, 23.31s/it] + 26%|██▌ | 1165/4476 [7:33:30<21:25:58, 23.30s/it] + 26%|██▌ | 1166/4476 [7:33:53<21:21:10, 23.22s/it] + 26%|██▌ | 1167/4476 [7:34:16<21:22:01, 23.25s/it] + 26%|██▌ | 1168/4476 [7:34:40<21:29:44, 23.39s/it] + 26%|██▌ | 1169/4476 [7:35:03<21:28:20, 23.37s/it] + 26%|██▌ | 1170/4476 [7:35:27<21:29:06, 23.40s/it] + +{'loss': 0.4112, 'learning_rate': 4.2033697166961716e-05, 'epoch': 0.78} + + 26%|██▌ | 1170/4476 [7:35:27<21:29:06, 23.40s/it] + 26%|██▌ | 1171/4476 [7:35:51<21:35:32, 23.52s/it] + 26%|██▌ | 1172/4476 [7:36:14<21:33:08, 23.48s/it] + 26%|██▌ | 1173/4476 [7:36:37<21:27:29, 23.39s/it] + 26%|██▌ | 1174/4476 [7:37:00<21:21:40, 23.29s/it] + 26%|██▋ | 1175/4476 [7:37:24<21:28:28, 23.42s/it] + 26%|██▋ | 1176/4476 [7:37:48<21:34:05, 23.53s/it] + 26%|██▋ | 1177/4476 [7:38:11<21:33:13, 23.52s/it] + 26%|██▋ | 1178/4476 [7:38:34<21:27:03, 23.42s/it] + 26%|██▋ | 1179/4476 [7:38:58<21:23:59, 23.37s/it] + 26%|██▋ | 1180/4476 [7:39:21<21:25:14, 23.40s/it] + +{'loss': 0.4018, 'learning_rate': 4.1904842592445906e-05, 'epoch': 0.79} + + 26%|██▋ | 1180/4476 [7:39:21<21:25:14, 23.40s/it] + 26%|██▋ | 1181/4476 [7:39:44<21:24:33, 23.39s/it] + 26%|██▋ | 1182/4476 [7:40:08<21:23:25, 23.38s/it] + 26%|██▋ | 1183/4476 [7:40:31<21:26:08, 23.43s/it] + 26%|██▋ | 1184/4476 [7:40:55<21:29:25, 23.50s/it] + 26%|██▋ | 1185/4476 [7:41:18<21:24:22, 23.42s/it] + 26%|██▋ | 1186/4476 [7:41:42<21:21:01, 23.36s/it] + 27%|██▋ | 1187/4476 [7:42:05<21:17:30, 23.31s/it] + 27%|██▋ | 1188/4476 [7:42:28<21:16:09, 23.29s/it] + 27%|██▋ | 1189/4476 [7:42:51<21:15:26, 23.28s/it] + 27%|██▋ | 1190/4476 [7:43:15<21:15:41, 23.29s/it] + +{'loss': 0.4068, 'learning_rate': 4.177515524051609e-05, 'epoch': 0.8} + + 27%|██▋ | 1190/4476 [7:43:15<21:15:41, 23.29s/it] + 27%|██▋ | 1191/4476 [7:43:38<21:15:59, 23.31s/it] + 27%|██▋ | 1192/4476 [7:44:01<21:13:21, 23.26s/it] + 27%|██▋ | 1193/4476 [7:44:25<21:22:05, 23.43s/it] + 27%|██▋ | 1194/4476 [7:44:48<21:19:01, 23.38s/it] + 27%|██▋ | 1195/4476 [7:45:11<21:12:02, 23.26s/it] + 27%|██▋ | 1196/4476 [7:45:34<21:12:27, 23.28s/it] + 27%|██▋ | 1197/4476 [7:45:57<21:04:51, 23.14s/it] + 27%|██▋ | 1198/4476 [7:46:20<21:02:30, 23.11s/it] + 27%|██▋ | 1199/4476 [7:46:44<21:05:44, 23.17s/it] + 27%|██▋ | 1200/4476 [7:47:07<21:07:22, 23.21s/it] + +{'loss': 0.4029, 'learning_rate': 4.1644641499915454e-05, 'epoch': 0.8} + + 27%|██▋ | 1200/4476 [7:47:07<21:07:22, 23.21s/it][INFO|trainer.py:2939] 2023-11-12 11:10:54,633 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1200 +[INFO|tokenization_utils_base.py:2437] 2023-11-12 11:10:54,664 >> 
tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1200/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-11-12 11:10:54,664 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1200/special_tokens_map.json +[INFO|tokenization_utils_base.py:2496] 2023-11-12 11:10:54,665 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1200/added_tokens.json + + 27%|██▋ | 1201/4476 [7:47:30<21:13:07, 23.32s/it] + 27%|██▋ | 1202/4476 [7:47:54<21:15:24, 23.37s/it] + 27%|██▋ | 1203/4476 [7:48:18<21:20:39, 23.48s/it] + 27%|██▋ | 1204/4476 [7:48:41<21:22:07, 23.51s/it] + 27%|██▋ | 1205/4476 [7:49:05<21:24:57, 23.57s/it] + 27%|██▋ | 1206/4476 [7:49:28<21:18:21, 23.46s/it] + 27%|██▋ | 1207/4476 [7:49:51<21:14:46, 23.40s/it] + 27%|██▋ | 1208/4476 [7:50:15<21:18:41, 23.48s/it] + 27%|██▋ | 1209/4476 [7:50:39<21:23:43, 23.58s/it] + 27%|██▋ | 1210/4476 [7:51:02<21:20:02, 23.52s/it] + +{'loss': 0.4009, 'learning_rate': 4.151330780009726e-05, 'epoch': 0.81} + + 27%|██▋ | 1210/4476 [7:51:02<21:20:02, 23.52s/it] + 27%|██▋ | 1211/4476 [7:51:26<21:18:58, 23.50s/it] + 27%|██▋ | 1212/4476 [7:51:49<21:10:50, 23.36s/it] + 27%|██▋ | 1213/4476 [7:52:12<21:14:17, 23.43s/it] + 27%|██▋ | 1214/4476 [7:52:36<21:19:42, 23.54s/it] + 27%|██▋ | 1215/4476 [7:52:59<21:15:21, 23.47s/it] + 27%|██▋ | 1216/4476 [7:53:23<21:09:57, 23.37s/it] + 27%|██▋ | 1217/4476 [7:53:46<21:05:28, 23.30s/it] + 27%|██▋ | 1218/4476 [7:54:09<21:08:34, 23.36s/it] + 27%|██▋ | 1219/4476 [7:54:33<21:10:10, 23.40s/it] + 27%|██▋ | 1220/4476 [7:54:56<21:08:25, 23.37s/it] + +{'loss': 0.4073, 'learning_rate': 4.1381160610908134e-05, 'epoch': 0.82} + + 27%|██▋ | 1220/4476 [7:54:56<21:08:25, 23.37s/it] + 27%|██▋ | 1221/4476 [7:55:19<21:07:29, 23.36s/it] + 27%|██▋ | 1222/4476 [7:55:43<21:06:51, 23.36s/it] + 27%|██▋ | 1223/4476 [7:56:06<21:04:29, 23.32s/it] + 27%|██▋ | 1224/4476 [7:56:30<21:10:20, 23.44s/it] + 27%|██▋ | 1225/4476 [7:56:53<21:07:42, 23.40s/it] + 27%|██▋ | 1226/4476 [7:57:16<21:06:11, 23.38s/it] + 27%|██▋ | 1227/4476 [7:57:40<21:10:04, 23.45s/it] + 27%|██▋ | 1228/4476 [7:58:03<21:07:11, 23.41s/it] + 27%|██▋ | 1229/4476 [7:58:27<21:06:30, 23.40s/it] + 27%|██▋ | 1230/4476 [7:58:50<21:05:14, 23.39s/it] + +{'loss': 0.4138, 'learning_rate': 4.124820644226936e-05, 'epoch': 0.82} + + 27%|██▋ | 1230/4476 [7:58:50<21:05:14, 23.39s/it] + 28%|██▊ | 1231/4476 [7:59:13<21:04:27, 23.38s/it] + 28%|██▊ | 1232/4476 [7:59:37<21:10:35, 23.50s/it] + 28%|██▊ | 1233/4476 [8:00:01<21:11:01, 23.52s/it] + 28%|██▊ | 1234/4476 [8:00:24<21:10:56, 23.52s/it] + 28%|██▊ | 1235/4476 [8:00:48<21:07:53, 23.47s/it] + 28%|██▊ | 1236/4476 [8:01:11<21:04:54, 23.42s/it] + 28%|██▊ | 1237/4476 [8:01:34<21:01:56, 23.38s/it] + 28%|██▊ | 1238/4476 [8:01:57<20:56:13, 23.28s/it] + 28%|██▊ | 1239/4476 [8:02:21<20:56:45, 23.30s/it] + 28%|██▊ | 1240/4476 [8:02:44<20:56:53, 23.30s/it] + +{'loss': 0.4139, 'learning_rate': 4.111445184385616e-05, 'epoch': 0.83} + + 28%|██▊ | 1240/4476 [8:02:44<20:56:53, 23.30s/it] + 28%|██▊ | 1241/4476 [8:03:08<21:03:25, 23.43s/it] + 28%|██▊ | 1242/4476 [8:03:31<21:00:09, 23.38s/it] + 28%|██▊ | 1243/4476 [8:03:54<20:59:39, 23.38s/it] + 28%|██▊ | 1244/4476 [8:04:18<20:57:40, 23.35s/it] + 28%|██▊ | 1245/4476 [8:04:41<20:59:33, 23.39s/it] + 28%|██▊ | 1246/4476 [8:05:04<20:54:43, 23.31s/it] + 28%|██▊ | 1247/4476 [8:05:27<20:53:22, 23.29s/it] + 28%|██▊ | 1248/4476 [8:05:51<20:51:26, 23.26s/it] + 28%|██▊ | 1249/4476 
[8:06:14<20:55:28, 23.34s/it] + 28%|██▊ | 1250/4476 [8:06:37<20:54:34, 23.33s/it] + +{'loss': 0.4062, 'learning_rate': 4.097990340477507e-05, 'epoch': 0.84} + + 28%|██▊ | 1250/4476 [8:06:37<20:54:34, 23.33s/it] + 28%|██▊ | 1251/4476 [8:07:01<20:50:52, 23.27s/it] + 28%|██▊ | 1252/4476 [8:07:24<20:55:32, 23.37s/it] + 28%|██▊ | 1253/4476 [8:07:48<20:58:07, 23.42s/it] + 28%|██▊ | 1254/4476 [8:08:11<20:55:48, 23.39s/it] + 28%|██▊ | 1255/4476 [8:08:34<20:57:08, 23.42s/it] + 28%|██▊ | 1256/4476 [8:08:58<20:58:33, 23.45s/it] + 28%|██▊ | 1257/4476 [8:09:21<20:52:08, 23.34s/it] + 28%|██▊ | 1258/4476 [8:09:44<20:50:14, 23.31s/it] + 28%|██▊ | 1259/4476 [8:10:08<20:49:38, 23.31s/it] + 28%|██▊ | 1260/4476 [8:10:31<20:51:34, 23.35s/it] + +{'loss': 0.4044, 'learning_rate': 4.0844567753239276e-05, 'epoch': 0.84} + + 28%|██▊ | 1260/4476 [8:10:31<20:51:34, 23.35s/it] + 28%|██▊ | 1261/4476 [8:10:55<20:56:06, 23.44s/it] + 28%|██▊ | 1262/4476 [8:11:18<20:53:40, 23.40s/it] + 28%|██▊ | 1263/4476 [8:11:41<20:50:13, 23.35s/it] + 28%|██▊ | 1264/4476 [8:12:05<20:55:17, 23.45s/it] + 28%|██▊ | 1265/4476 [8:12:28<20:47:00, 23.30s/it] + 28%|██▊ | 1266/4476 [8:12:51<20:39:51, 23.17s/it] + 28%|██▊ | 1267/4476 [8:13:14<20:34:21, 23.08s/it] + 28%|██▊ | 1268/4476 [8:13:37<20:37:20, 23.14s/it] + 28%|██▊ | 1269/4476 [8:14:00<20:39:40, 23.19s/it] + 28%|██▊ | 1270/4476 [8:14:23<20:36:10, 23.14s/it] + +{'loss': 0.3978, 'learning_rate': 4.070845155624221e-05, 'epoch': 0.85} + + 28%|██▊ | 1270/4476 [8:14:23<20:36:10, 23.14s/it] + 28%|██▊ | 1271/4476 [8:14:47<20:38:46, 23.19s/it] + 28%|██▊ | 1272/4476 [8:15:10<20:43:40, 23.29s/it] + 28%|██▊ | 1273/4476 [8:15:34<20:47:29, 23.37s/it] + 28%|██▊ | 1274/4476 [8:15:57<20:46:34, 23.36s/it] + 28%|██▊ | 1275/4476 [8:16:20<20:43:45, 23.31s/it] + 29%|██▊ | 1276/4476 [8:16:44<20:46:05, 23.36s/it] + 29%|██▊ | 1277/4476 [8:17:07<20:48:07, 23.41s/it] + 29%|██▊ | 1278/4476 [8:17:30<20:45:26, 23.37s/it] + 29%|██▊ | 1279/4476 [8:17:54<20:43:04, 23.33s/it] + 29%|██▊ | 1280/4476 [8:18:17<20:42:23, 23.32s/it] + +{'loss': 0.4102, 'learning_rate': 4.0571561519228984e-05, 'epoch': 0.86} + + 29%|██▊ | 1280/4476 [8:18:17<20:42:23, 23.32s/it] + 29%|██▊ | 1281/4476 [8:18:41<20:47:40, 23.43s/it] + 29%|██▊ | 1282/4476 [8:19:04<20:43:16, 23.36s/it] + 29%|██▊ | 1283/4476 [8:19:27<20:40:53, 23.32s/it] + 29%|██▊ | 1284/4476 [8:19:51<20:42:51, 23.36s/it] + 29%|██▊ | 1285/4476 [8:20:14<20:37:38, 23.27s/it] + 29%|██▊ | 1286/4476 [8:20:37<20:33:17, 23.20s/it] + 29%|██▉ | 1287/4476 [8:21:00<20:36:50, 23.27s/it] + 29%|██▉ | 1288/4476 [8:21:24<20:39:40, 23.33s/it] + 29%|██▉ | 1289/4476 [8:21:47<20:35:18, 23.26s/it] + 29%|██▉ | 1290/4476 [8:22:10<20:38:37, 23.33s/it] + +{'loss': 0.4052, 'learning_rate': 4.043390438576616e-05, 'epoch': 0.86} + + 29%|██▉ | 1290/4476 [8:22:10<20:38:37, 23.33s/it] + 29%|██▉ | 1291/4476 [8:22:34<20:42:32, 23.41s/it] + 29%|██▉ | 1292/4476 [8:22:57<20:41:16, 23.39s/it] + 29%|██▉ | 1293/4476 [8:23:20<20:40:55, 23.39s/it] + 29%|██▉ | 1294/4476 [8:23:44<20:39:24, 23.37s/it] + 29%|██▉ | 1295/4476 [8:24:08<20:45:45, 23.50s/it] + 29%|██▉ | 1296/4476 [8:24:31<20:41:39, 23.43s/it] + 29%|██▉ | 1297/4476 [8:24:54<20:42:27, 23.45s/it] + 29%|██▉ | 1298/4476 [8:25:18<20:40:13, 23.42s/it] + 29%|██▉ | 1299/4476 [8:25:41<20:32:32, 23.28s/it] + 29%|██▉ | 1300/4476 [8:26:04<20:31:30, 23.27s/it] + +{'loss': 0.4048, 'learning_rate': 4.029548693720949e-05, 'epoch': 0.87} + + 29%|██▉ | 1300/4476 [8:26:04<20:31:30, 23.27s/it] + 29%|██▉ | 1301/4476 [8:26:27<20:29:06, 23.23s/it] + 29%|██▉ | 1302/4476 [8:26:50<20:29:34, 
+ 29%|██▉ | 1310/4476 [8:29:57<20:35:25, 23.41s/it]
+{'loss': 0.4008, 'learning_rate': 4.0156315992369864e-05, 'epoch': 0.88}
+ 29%|██▉ | 1320/4476 [8:33:50<20:28:01, 23.35s/it]
+{'loss': 0.4038, 'learning_rate': 4.001639840717741e-05, 'epoch': 0.88}
+ 30%|██▉ | 1330/4476 [8:37:45<20:29:22, 23.45s/it]
+{'loss': 0.408, 'learning_rate': 3.9875741074343744e-05, 'epoch': 0.89}
+ 30%|██▉ | 1340/4476 [8:41:39<20:28:59, 23.51s/it]
+{'loss': 0.406, 'learning_rate': 3.973435092302239e-05, 'epoch': 0.9}
+ 30%|███ | 1350/4476 [8:45:31<20:05:47, 23.14s/it]
+{'loss': 0.3991, 'learning_rate': 3.959223491846749e-05, 'epoch': 0.9}
+ 30%|███ | 1360/4476 [8:49:24<20:08:38, 23.27s/it]
+{'loss': 0.4091, 'learning_rate': 3.94494000616906e-05, 'epoch': 0.91}
+ 31%|███ | 1370/4476 [8:53:16<20:07:22, 23.32s/it]
+{'loss': 0.4, 'learning_rate': 3.93058533891159e-05, 'epoch': 0.92}
+ 31%|███ | 1380/4476 [8:57:09<20:01:42, 23.29s/it]
+{'loss': 0.4112, 'learning_rate': 3.916160197223344e-05, 'epoch': 0.92}
+ 31%|███ | 1390/4476 [9:01:03<19:55:11, 23.24s/it]
+{'loss': 0.4024, 'learning_rate': 3.901665291725091e-05, 'epoch': 0.93}
+ 31%|███▏ | 1400/4476 [9:04:56<19:57:05, 23.35s/it]
+{'loss': 0.4048, 'learning_rate': 3.887101336474346e-05, 'epoch': 0.94}
+[INFO|trainer.py:2939] 2023-11-12 12:28:43,500 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1400
+[INFO|tokenization_utils_base.py:2437] 2023-11-12 12:28:43,531 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1400/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-12 12:28:43,531 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1400/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-12 12:28:43,531 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1400/added_tokens.json
+ 32%|███▏ | 1410/4476 [9:08:50<19:58:11, 23.45s/it]
+{'loss': 0.4112, 'learning_rate': 3.8724690489302004e-05, 'epoch': 0.94}
+ 32%|███▏ | 1420/4476 [9:12:43<19:49:34, 23.36s/it]
+{'loss': 0.3947, 'learning_rate': 3.857769149917973e-05, 'epoch': 0.95}
+ 32%|███▏ | 1430/4476 [9:16:36<19:38:55, 23.22s/it]
+{'loss': 0.4005, 'learning_rate': 3.843002363593707e-05, 'epoch': 0.96}
+ 32%|███▏ | 1440/4476 [9:20:29<19:44:17, 23.40s/it]
+{'loss': 0.3976, 'learning_rate': 3.828169417408488e-05, 'epoch': 0.96}
+ 32%|███▏ | 1450/4476 [9:24:24<19:45:32, 23.51s/it]
+{'loss': 0.4006, 'learning_rate': 3.8132710420726146e-05, 'epoch': 0.97}
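Each metrics record above is a plain Python dict literal, so the full loss and learning-rate curves can be recovered from the raw log. A sketch, with a hypothetical log path:

    import ast
    import re

    METRIC_RE = re.compile(r"\{'loss':.*?\}")

    def parse_metrics(log_path: str) -> list[dict]:
        # one record per logging interval (every 10 optimizer steps in this run)
        records = []
        with open(log_path, encoding="utf-8") as fh:
            for line in fh:
                for chunk in METRIC_RE.findall(line):
                    records.append(ast.literal_eval(chunk))
        return records

    # usage (hypothetical filename):
    # losses = [r['loss'] for r in parse_metrics('train.log')]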
+ 33%|███▎ | 1460/4476 [9:28:18<19:37:09, 23.42s/it]
+{'loss': 0.398, 'learning_rate': 3.7983079715195984e-05, 'epoch': 0.98}
+ 33%|███▎ | 1470/4476 [9:32:12<19:29:23, 23.34s/it]
+{'loss': 0.3987, 'learning_rate': 3.78328094287001e-05, 'epoch': 0.99}
+ 33%|███▎ | 1480/4476 [9:36:06<19:28:09, 23.39s/it]
+{'loss': 0.4013, 'learning_rate': 3.768190696395162e-05, 'epoch': 0.99}
+ 33%|███▎ | 1490/4476 [9:39:59<19:16:33, 23.24s/it]
+{'loss': 0.4028, 'learning_rate': 3.7530379754806494e-05, 'epoch': 1.0}
+ 34%|███▎ | 1500/4476 [9:43:53<19:18:10, 23.35s/it]
+{'loss': 0.4036, 'learning_rate': 3.737823526589722e-05, 'epoch': 1.01}
+ 34%|███▎ | 1510/4476 [9:47:47<19:22:06, 23.51s/it]
+{'loss': 0.3937, 'learning_rate': 3.7225480992265125e-05, 'epoch': 1.01}
+ 34%|███▍ | 1520/4476 [9:51:40<19:09:48, 23.34s/it]
+{'loss': 0.4007, 'learning_rate': 3.707212445899116e-05, 'epoch': 1.02}
+ 34%|███▍ | 1530/4476 [9:55:33<19:10:56, 23.44s/it]
+{'loss': 0.4004, 'learning_rate': 3.6918173220825204e-05, 'epoch': 1.03}
+ 34%|███▍ | 1540/4476 [9:59:28<19:05:02, 23.40s/it]
+{'loss': 0.4004, 'learning_rate': 3.6763634861813836e-05, 'epoch': 1.03}
+ 35%|███▍ | 1550/4476 [10:03:22<18:56:28, 23.30s/it]
+{'loss': 0.3991, 'learning_rate': 3.660851699492679e-05, 'epoch': 1.04}
+ 35%|███▍ | 1560/4476 [10:07:17<19:03:11, 23.52s/it]
+{'loss': 0.4042, 'learning_rate': 3.645282726168191e-05, 'epoch': 1.05}
+ 35%|███▌ | 1570/4476 [10:11:12<18:56:15, 23.46s/it]
+{'loss': 0.4043, 'learning_rate': 3.6296573331768664e-05, 'epoch': 1.05}
+ 35%|███▌ | 1580/4476 [10:15:06<18:43:20, 23.27s/it]
+{'loss': 0.3948, 'learning_rate': 3.613976290267036e-05, 'epoch': 1.06}
+ 36%|███▌ | 1590/4476 [10:19:00<18:45:27, 23.40s/it]
+{'loss': 0.3952, 'learning_rate': 3.598240369928494e-05, 'epoch': 1.07}
+ 36%|███▌ | 1600/4476 [10:22:53<18:36:55, 23.30s/it]
+{'loss': 0.4002, 'learning_rate': 3.5824503473544405e-05, 'epoch': 1.07}
+[INFO|trainer.py:2939] 2023-11-12 13:46:40,777 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1600
+[INFO|tokenization_utils_base.py:2437] 2023-11-12 13:46:40,815 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1600/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-12 13:46:40,816 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1600/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-12 13:46:40,816 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1600/added_tokens.json
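The remaining-time estimate in the progress bar is just the steps left times the smoothed per-step time; a quick check against the 18:36:55 shown at step 1600:

    eta_s = (4476 - 1600) * 23.30       # 2876 steps left at ~23.30 s/it
    h, rem = divmod(int(eta_s), 3600)
    m, s = divmod(rem, 60)
    print(f"{h}:{m:02d}:{s:02d}")       # 18:36:50, within seconds of the bar's 18:36:55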
+ 36%|███▌ | 1610/4476 [10:26:46<18:32:22, 23.29s/it]
+{'loss': 0.4079, 'learning_rate': 3.566607000403298e-05, 'epoch': 1.08}
+ 36%|███▌ | 1620/4476 [10:30:40<18:29:18, 23.30s/it]
+{'loss': 0.3942, 'learning_rate': 3.5523030408223166e-05, 'epoch': 1.09}
+ 36%|███▋ | 1630/4476 [10:34:33<18:24:21, 23.28s/it]
+{'loss': 0.3935, 'learning_rate': 3.5363605299319165e-05, 'epoch': 1.09}
+ 37%|███▋ | 1640/4476 [10:38:26<18:21:04, 23.29s/it]
+{'loss': 0.3898, 'learning_rate': 3.520366965171161e-05, 'epoch': 1.1}
+ 37%|███▋ | 1650/4476 [10:42:21<18:24:17, 23.45s/it]
+{'loss': 0.4006, 'learning_rate': 3.504323134425501e-05, 'epoch': 1.11}
+ 37%|███▋ | 1660/4476 [10:46:15<18:23:22, 23.51s/it]
+{'loss': 0.4089, 'learning_rate': 3.48822982805662e-05, 'epoch': 1.11}
+ 37%|███▋ | 1670/4476 [10:50:08<18:12:01, 23.35s/it]
+{'loss': 0.3982, 'learning_rate': 3.472087838863505e-05, 'epoch': 1.12}
+ 38%|███▊ | 1680/4476 [10:54:01<18:02:21, 23.23s/it]
+{'loss': 0.399, 'learning_rate': 3.455897962043387e-05, 'epoch': 1.13}
+ 38%|███▊ | 1690/4476 [10:57:54<18:01:35, 23.29s/it]
+{'loss': 0.3964, 'learning_rate': 3.4396609951525676e-05, 'epoch': 1.13}
+ 38%|███▊ | 1700/4476 [11:01:47<17:56:40, 23.27s/it]
+{'loss': 0.3909, 'learning_rate': 3.423377738067132e-05, 'epoch': 1.14}
+ 38%|███▊ | 1710/4476 [11:05:41<18:01:31, 23.46s/it]
+{'loss': 0.4015, 'learning_rate': 3.407048992943541e-05, 'epoch': 1.15}
+ 38%|███▊ | 1720/4476 [11:09:34<17:46:21, 23.22s/it]
+{'loss': 0.3915, 'learning_rate': 3.39067556417912e-05, 'epoch': 1.15}
+ 39%|███▊ | 1730/4476 [11:13:27<17:46:13, 23.30s/it]
+{'loss': 0.3845, 'learning_rate': 3.374258258372426e-05, 'epoch': 1.16}
+ 39%|███▉ | 1740/4476 [11:17:21<17:48:27, 23.43s/it]
+{'loss': 0.4018, 'learning_rate': 3.357797884283517e-05, 'epoch': 1.17}
+ 39%|███▉ | 1750/4476 [11:21:14<17:40:10, 23.33s/it]
+{'loss': 0.3914, 'learning_rate': 3.3412952527941096e-05, 'epoch': 1.17}
+ 39%|███▉ | 1760/4476 [11:25:07<17:28:15, 23.16s/it]
+{'loss': 0.3909, 'learning_rate': 3.32475117686763e-05, 'epoch': 1.18}
+ 40%|███▉ | 1770/4476 [11:29:00<17:27:44, 23.23s/it]
+{'loss': 0.3993, 'learning_rate': 3.308166471509171e-05, 'epoch': 1.19}
+ 40%|███▉ | 1780/4476 [11:32:53<17:30:10, 23.37s/it]
+{'loss': 0.3906, 'learning_rate': 3.2915419537253346e-05, 'epoch': 1.19}
+ 40%|███▉ | 1790/4476 [11:36:47<17:26:13, 23.37s/it]
+{'loss': 0.3897, 'learning_rate': 3.274878442483991e-05, 'epoch': 1.2}
+ 40%|████ | 1800/4476 [11:40:41<17:20:11, 23.32s/it]
+{'loss': 0.3954, 'learning_rate': 3.258176758673932e-05, 'epoch': 1.21}
+[INFO|trainer.py:2939] 2023-11-12 15:04:28,764 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1800
+[INFO|tokenization_utils_base.py:2437] 2023-11-12 15:04:28,795 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1800/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-12 15:04:28,795 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1800/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-12 15:04:28,795 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-1800/added_tokens.json
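The 'epoch' field is bookkeeping: step / total steps x number of epochs, rounded to two decimals. Epoch 1.0 arrives near step 1492, a third of the way through, so the 4476 steps cover three passes over the data:

    TOTAL_STEPS = 4476
    NUM_EPOCHS = 3    # implied by epoch 1.0 landing a third of the way through the run

    def epoch_at(step: int) -> float:
        return round(step / TOTAL_STEPS * NUM_EPOCHS, 2)

    print(epoch_at(1800))  # 1.21, as logged at step 1800 above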
+ 40%|████ | 1810/4476 [11:44:35<17:09:14, 23.16s/it]
+{'loss': 0.3835, 'learning_rate': 3.241437725064431e-05, 'epoch': 1.21}
+ 41%|████ | 1820/4476 [11:48:29<17:16:22, 23.41s/it]
+{'loss': 0.3854, 'learning_rate': 3.224662166264711e-05, 'epoch': 1.22}
+ 41%|████ | 1830/4476 [11:52:22<17:02:01, 23.17s/it]
+{'loss': 0.3924, 'learning_rate': 3.207850908683322e-05, 'epoch': 1.23}
+ 41%|████ | 1840/4476 [11:56:16<17:07:14, 23.38s/it]
+{'loss': 0.3888, 'learning_rate': 3.191004780487434e-05, 'epoch': 1.23}
+ 41%|████▏ | 1850/4476 [12:00:10<17:02:59, 23.37s/it]
+{'loss': 0.3914, 'learning_rate': 3.1741246115620336e-05, 'epoch': 1.24}
+ 42%|████▏ | 1860/4476 [12:04:05<17:01:48, 23.44s/it]
+{'loss': 0.391, 'learning_rate': 3.157211233469042e-05, 'epoch': 1.25}
+ 42%|████▏ | 1870/4476 [12:07:58<16:56:24, 23.40s/it]
+{'loss': 0.3916, 'learning_rate': 3.140265479406358e-05, 'epoch': 1.25}
+ 42%|████▏ | 1880/4476 [12:11:51<16:50:17, 23.35s/it]
+{'loss': 0.4012, 'learning_rate': 3.1232881841668015e-05, 'epoch': 1.26}
+ 42%|████▏ | 1890/4476 [12:15:43<16:40:41, 23.22s/it]
+{'loss': 0.3934, 'learning_rate': 3.106280184096996e-05, 'epoch': 1.27}
+ 42%|████▏ | 1900/4476 [12:19:37<16:45:15, 23.41s/it]
+{'loss': 0.3908, 'learning_rate': 3.089242317056168e-05, 'epoch': 1.27}
+ 43%|████▎ | 1910/4476 [12:23:30<16:41:18, 23.41s/it]
+{'loss': 0.3972, 'learning_rate': 3.072175422374867e-05, 'epoch': 1.28}
+ 43%|████▎ | 1920/4476 [12:27:25<16:40:27, 23.48s/it]
+{'loss': 0.3963, 'learning_rate': 3.055080340813623e-05, 'epoch': 1.29}
+ 43%|████▎ | 1930/4476 [12:31:20<16:36:48, 23.49s/it]
+{'loss': 0.3941, 'learning_rate': 3.0379579145215287e-05, 'epoch': 1.29}
+ 43%|████▎ | 1940/4476 [12:35:13<16:27:42, 23.37s/it]
+{'loss': 0.3887, 'learning_rate': 3.0208089869947475e-05, 'epoch': 1.3}
+ 44%|████▎ | 1950/4476 [12:39:06<16:22:29, 23.34s/it]
+{'loss': 0.3879, 'learning_rate': 3.0036344030349644e-05, 'epoch': 1.31}
+ 44%|████▍ | 1960/4476 [12:42:59<16:16:05, 23.28s/it]
+{'loss': 0.3945, 'learning_rate': 2.9864350087077702e-05, 'epoch': 1.31}
+ 44%|████▍ | 1970/4476 [12:46:54<16:14:55, 23.34s/it]
+{'loss': 0.3909, 'learning_rate': 2.969211651300978e-05, 'epoch': 1.32}
+ 44%|████▍ | 1980/4476 [12:50:47<16:09:19, 23.30s/it]
+{'loss': 0.3871, 'learning_rate': 2.9519651792828877e-05, 'epoch': 1.33}
+ 44%|████▍ | 1990/4476 [12:54:39<16:05:04, 23.29s/it]
+{'loss': 0.3803, 'learning_rate': 2.9346964422604846e-05, 'epoch': 1.33}
+ 45%|████▍ | 2000/4476 [12:58:33<16:03:03, 23.34s/it]
+{'loss': 0.3868, 'learning_rate': 2.9174062909375892e-05, 'epoch': 1.34}
+[INFO|trainer.py:2939] 2023-11-12 16:22:20,495 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2000
+[INFO|tokenization_utils_base.py:2437] 2023-11-12 16:22:20,536 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2000/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-12 16:22:20,536 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2000/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-12 16:22:20,536 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2000/added_tokens.json
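Checkpoints land every 200 optimizer steps (1800, 2000, 2200, ...), which at roughly 23.4 s/it is about 78 minutes of wall clock between saves; the "Saving model checkpoint" timestamps bear this out:

    from datetime import datetime

    # timestamps taken from the INFO lines above and below
    saves = {
        1800: datetime(2023, 11, 12, 15, 4, 28),
        2000: datetime(2023, 11, 12, 16, 22, 20),
        2200: datetime(2023, 11, 12, 17, 40, 7),
    }
    print(saves[2000] - saves[1800])  # 1:17:52, about 200 steps x 23.4 s/it
    print(saves[2200] - saves[2000])  # 1:17:47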
+ 45%|████▍ | 2010/4476 [13:02:26<15:55:48, 23.26s/it]
+{'loss': 0.385, 'learning_rate': 2.9000955770729464e-05, 'epoch': 1.35}
+ 45%|████▌ | 2020/4476 [13:06:19<15:56:14, 23.36s/it]
+{'loss': 0.3871, 'learning_rate': 2.8827651534382655e-05, 'epoch': 1.35}
+ 45%|████▌ | 2030/4476 [13:10:13<15:53:39, 23.39s/it]
+{'loss': 0.3956, 'learning_rate': 2.8654158737762122e-05, 'epoch': 1.36}
+ 46%|████▌ | 2040/4476 [13:14:06<15:41:09, 23.18s/it]
+{'loss': 0.3884, 'learning_rate': 2.8480485927583506e-05, 'epoch': 1.37}
+ 46%|████▌ | 2050/4476 [13:17:59<15:43:26, 23.33s/it]
+{'loss': 0.3829, 'learning_rate': 2.8306641659430382e-05, 'epoch': 1.37}
+ 46%|████▌ | 2060/4476 [13:21:54<15:45:54, 23.49s/it]
+{'loss': 0.3916, 'learning_rate': 2.8132634497332815e-05, 'epoch': 1.38}
+ 46%|████▌ | 2070/4476 [13:25:47<15:34:08, 23.30s/it]
+{'loss': 0.3924, 'learning_rate': 2.7958473013345447e-05, 'epoch': 1.39}
+ 46%|████▋ | 2080/4476 [13:29:39<15:27:58, 23.24s/it]
+{'loss': 0.3906, 'learning_rate': 2.7784165787125226e-05, 'epoch': 1.39}
+ 47%|████▋ | 2090/4476 [13:33:32<15:31:40, 23.43s/it]
+{'loss': 0.383, 'learning_rate': 2.7609721405508758e-05, 'epoch': 1.4}
+ 47%|████▋ | 2100/4476 [13:37:25<15:19:39, 23.22s/it]
+{'loss': 0.3892, 'learning_rate': 2.7435148462089282e-05, 'epoch': 1.41}
+ 47%|████▋ | 2110/4476 [13:41:20<15:20:53, 23.35s/it]
+{'loss': 0.3866, 'learning_rate': 2.7260455556793325e-05, 'epoch': 1.41}
+ 47%|████▋ | 2120/4476 [13:45:13<15:15:54, 23.33s/it]
+{'loss': 0.382, 'learning_rate': 2.708565129545706e-05, 'epoch': 1.42}
+ 48%|████▊ | 2130/4476 [13:49:06<15:11:58, 23.32s/it]
+{'loss': 0.3825, 'learning_rate': 2.691074428940237e-05, 'epoch': 1.43}
+ 48%|████▊ | 2140/4476 [13:53:00<15:12:53, 23.45s/it]
+{'loss': 0.3828, 'learning_rate': 2.673574315501259e-05, 'epoch': 1.43}
+ 48%|████▊ | 2150/4476 [13:56:53<15:05:19, 23.35s/it]
+{'loss': 0.3845, 'learning_rate': 2.656065651330808e-05, 'epoch': 1.44}
48%|████▊ | 2159/4476 [14:00:22<14:58:05, 23.26s/it] + 48%|████▊ | 2160/4476 [14:00:46<15:03:54, 23.42s/it] + +{'loss': 0.3904, 'learning_rate': 2.6385492989521522e-05, 'epoch': 1.45} + + 48%|████▊ | 2160/4476 [14:00:46<15:03:54, 23.42s/it] + 48%|████▊ | 2161/4476 [14:01:09<15:03:24, 23.41s/it] + 48%|████▊ | 2162/4476 [14:01:32<14:58:37, 23.30s/it] + 48%|████▊ | 2163/4476 [14:01:56<14:59:08, 23.32s/it] + 48%|████▊ | 2164/4476 [14:02:19<14:57:47, 23.30s/it] + 48%|████▊ | 2165/4476 [14:02:42<14:57:56, 23.31s/it] + 48%|████▊ | 2166/4476 [14:03:06<14:57:36, 23.31s/it] + 48%|████▊ | 2167/4476 [14:03:29<14:55:21, 23.27s/it] + 48%|████▊ | 2168/4476 [14:03:52<14:54:52, 23.26s/it] + 48%|████▊ | 2169/4476 [14:04:16<14:57:47, 23.35s/it] + 48%|████▊ | 2170/4476 [14:04:39<14:56:41, 23.33s/it] + +{'loss': 0.3934, 'learning_rate': 2.6210261212673004e-05, 'epoch': 1.45} + + 48%|████▊ | 2170/4476 [14:04:39<14:56:41, 23.33s/it] + 49%|████▊ | 2171/4476 [14:05:02<14:55:05, 23.30s/it] + 49%|████▊ | 2172/4476 [14:05:26<14:56:31, 23.35s/it] + 49%|████▊ | 2173/4476 [14:05:49<14:58:30, 23.41s/it] + 49%|████▊ | 2174/4476 [14:06:12<14:53:06, 23.28s/it] + 49%|████▊ | 2175/4476 [14:06:36<14:57:59, 23.42s/it] + 49%|████▊ | 2176/4476 [14:06:59<14:55:19, 23.36s/it] + 49%|████▊ | 2177/4476 [14:07:23<14:57:30, 23.42s/it] + 49%|████▊ | 2178/4476 [14:07:46<15:00:51, 23.52s/it] + 49%|████▊ | 2179/4476 [14:08:10<15:03:23, 23.60s/it] + 49%|████▊ | 2180/4476 [14:08:33<14:58:42, 23.49s/it] + +{'loss': 0.3893, 'learning_rate': 2.6034969815144938e-05, 'epoch': 1.46} + + 49%|████▊ | 2180/4476 [14:08:33<14:58:42, 23.49s/it] + 49%|████▊ | 2181/4476 [14:08:57<14:53:33, 23.36s/it] + 49%|████▊ | 2182/4476 [14:09:20<14:55:18, 23.42s/it] + 49%|████▉ | 2183/4476 [14:09:43<14:46:42, 23.20s/it] + 49%|████▉ | 2184/4476 [14:10:06<14:46:12, 23.20s/it] + 49%|████▉ | 2185/4476 [14:10:29<14:43:26, 23.14s/it] + 49%|████▉ | 2186/4476 [14:10:53<14:49:16, 23.30s/it] + 49%|████▉ | 2187/4476 [14:11:16<14:48:20, 23.29s/it] + 49%|████▉ | 2188/4476 [14:11:39<14:48:00, 23.29s/it] + 49%|████▉ | 2189/4476 [14:12:03<14:52:59, 23.43s/it] + 49%|████▉ | 2190/4476 [14:12:26<14:52:04, 23.41s/it] + +{'loss': 0.3965, 'learning_rate': 2.5859627432256816e-05, 'epoch': 1.47} + + 49%|████▉ | 2190/4476 [14:12:26<14:52:04, 23.41s/it] + 49%|████▉ | 2191/4476 [14:12:50<14:49:46, 23.36s/it] + 49%|████▉ | 2192/4476 [14:13:13<14:47:52, 23.32s/it] + 49%|████▉ | 2193/4476 [14:13:37<14:52:19, 23.45s/it] + 49%|████▉ | 2194/4476 [14:14:00<14:48:30, 23.36s/it] + 49%|████▉ | 2195/4476 [14:14:23<14:47:29, 23.34s/it] + 49%|████▉ | 2196/4476 [14:14:47<14:50:33, 23.44s/it] + 49%|████▉ | 2197/4476 [14:15:10<14:51:41, 23.48s/it] + 49%|████▉ | 2198/4476 [14:15:33<14:48:21, 23.40s/it] + 49%|████▉ | 2199/4476 [14:15:57<14:44:30, 23.31s/it] + 49%|████▉ | 2200/4476 [14:16:20<14:44:04, 23.31s/it] + +{'loss': 0.3833, 'learning_rate': 2.568424270183981e-05, 'epoch': 1.47} + + 49%|████▉ | 2200/4476 [14:16:20<14:44:04, 23.31s/it][INFO|trainer.py:2939] 2023-11-12 17:40:07,612 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2200 +[INFO|tokenization_utils_base.py:2437] 2023-11-12 17:40:07,643 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2200/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-11-12 17:40:07,643 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2200/special_tokens_map.json +[INFO|tokenization_utils_base.py:2496] 
2023-11-12 17:40:07,643 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2200/added_tokens.json + + 49%|████▉ | 2201/4476 [14:16:43<14:42:15, 23.27s/it] + 49%|████▉ | 2202/4476 [14:17:06<14:42:02, 23.27s/it] + 49%|████▉ | 2203/4476 [14:17:30<14:43:40, 23.33s/it] + 49%|████▉ | 2204/4476 [14:17:53<14:42:31, 23.31s/it] + 49%|████▉ | 2205/4476 [14:18:17<14:44:34, 23.37s/it] + 49%|████▉ | 2206/4476 [14:18:40<14:44:41, 23.38s/it] + 49%|████▉ | 2207/4476 [14:19:03<14:42:53, 23.35s/it] + 49%|████▉ | 2208/4476 [14:19:27<14:43:51, 23.38s/it] + 49%|████▉ | 2209/4476 [14:19:50<14:43:12, 23.38s/it] + 49%|████▉ | 2210/4476 [14:20:13<14:41:16, 23.33s/it] + +{'loss': 0.3822, 'learning_rate': 2.550882426381123e-05, 'epoch': 1.48} + + 49%|████▉ | 2210/4476 [14:20:13<14:41:16, 23.33s/it] + 49%|████▉ | 2211/4476 [14:20:36<14:37:47, 23.25s/it] + 49%|████▉ | 2212/4476 [14:21:00<14:38:05, 23.27s/it] + 49%|████▉ | 2213/4476 [14:21:23<14:37:28, 23.26s/it] + 49%|████▉ | 2214/4476 [14:21:46<14:38:18, 23.30s/it] + 49%|████▉ | 2215/4476 [14:22:10<14:41:04, 23.38s/it] + 50%|████▉ | 2216/4476 [14:22:33<14:40:08, 23.37s/it] + 50%|████▉ | 2217/4476 [14:22:57<14:38:50, 23.34s/it] + 50%|████▉ | 2218/4476 [14:23:20<14:35:18, 23.26s/it] + 50%|████▉ | 2219/4476 [14:23:43<14:32:32, 23.20s/it] + 50%|████▉ | 2220/4476 [14:24:06<14:32:16, 23.20s/it] + +{'loss': 0.3823, 'learning_rate': 2.5333380759748925e-05, 'epoch': 1.49} + + 50%|████▉ | 2220/4476 [14:24:06<14:32:16, 23.20s/it] + 50%|████▉ | 2221/4476 [14:24:29<14:31:32, 23.19s/it] + 50%|████▉ | 2222/4476 [14:24:52<14:34:19, 23.27s/it] + 50%|████▉ | 2223/4476 [14:25:16<14:36:16, 23.34s/it] + 50%|████▉ | 2224/4476 [14:25:40<14:39:10, 23.42s/it] + 50%|████▉ | 2225/4476 [14:26:03<14:34:39, 23.31s/it] + 50%|████▉ | 2226/4476 [14:26:26<14:35:57, 23.36s/it] + 50%|████▉ | 2227/4476 [14:26:49<14:33:20, 23.30s/it] + 50%|████▉ | 2228/4476 [14:27:12<14:31:55, 23.27s/it] + 50%|████▉ | 2229/4476 [14:27:36<14:31:56, 23.28s/it] + 50%|████▉ | 2230/4476 [14:27:59<14:32:45, 23.31s/it] + +{'loss': 0.3861, 'learning_rate': 2.515792083246556e-05, 'epoch': 1.49} + + 50%|████▉ | 2230/4476 [14:27:59<14:32:45, 23.31s/it] + 50%|████▉ | 2231/4476 [14:28:22<14:30:00, 23.25s/it] + 50%|████▉ | 2232/4476 [14:28:46<14:30:45, 23.28s/it] + 50%|████▉ | 2233/4476 [14:29:09<14:27:33, 23.21s/it] + 50%|████▉ | 2234/4476 [14:29:32<14:29:53, 23.28s/it] + 50%|████▉ | 2235/4476 [14:29:55<14:28:45, 23.26s/it] + 50%|████▉ | 2236/4476 [14:30:19<14:29:59, 23.30s/it] + 50%|████▉ | 2237/4476 [14:30:42<14:26:54, 23.23s/it] + 50%|█████ | 2238/4476 [14:31:05<14:27:24, 23.25s/it] + 50%|█████ | 2239/4476 [14:31:28<14:26:50, 23.25s/it] + 50%|█████ | 2240/4476 [14:31:51<14:24:21, 23.19s/it] + +{'loss': 0.3861, 'learning_rate': 2.4982453125582834e-05, 'epoch': 1.5} + + 50%|█████ | 2240/4476 [14:31:51<14:24:21, 23.19s/it] + 50%|█████ | 2241/4476 [14:32:14<14:20:59, 23.11s/it] + 50%|█████ | 2242/4476 [14:32:38<14:23:42, 23.20s/it] + 50%|█████ | 2243/4476 [14:33:01<14:21:02, 23.14s/it] + 50%|█████ | 2244/4476 [14:33:24<14:24:11, 23.23s/it] + 50%|█████ | 2245/4476 [14:33:47<14:23:10, 23.21s/it] + 50%|█████ | 2246/4476 [14:34:11<14:24:40, 23.26s/it] + 50%|█████ | 2247/4476 [14:34:34<14:27:35, 23.35s/it] + 50%|█████ | 2248/4476 [14:34:58<14:27:44, 23.37s/it] + 50%|█████ | 2249/4476 [14:35:21<14:31:39, 23.48s/it] + 50%|█████ | 2250/4476 [14:35:45<14:32:56, 23.53s/it] + +{'loss': 0.3909, 'learning_rate': 2.4806986283105712e-05, 'epoch': 1.51} + + 50%|█████ | 2250/4476 [14:35:45<14:32:56, 
23.53s/it] + 50%|█████ | 2251/4476 [14:36:08<14:30:02, 23.46s/it] + 50%|█████ | 2252/4476 [14:36:32<14:29:59, 23.47s/it] + 50%|█████ | 2253/4476 [14:36:55<14:25:55, 23.37s/it] + 50%|█████ | 2254/4476 [14:37:18<14:20:03, 23.22s/it] + 50%|█████ | 2255/4476 [14:37:41<14:20:21, 23.24s/it] + 50%|█████ | 2256/4476 [14:38:05<14:23:29, 23.34s/it] + 50%|█████ | 2257/4476 [14:38:28<14:24:31, 23.38s/it] + 50%|█████ | 2258/4476 [14:38:52<14:28:46, 23.50s/it] + 50%|█████ | 2259/4476 [14:39:16<14:29:21, 23.53s/it] + 50%|█████ | 2260/4476 [14:39:39<14:32:01, 23.61s/it] + +{'loss': 0.3854, 'learning_rate': 2.463152894899658e-05, 'epoch': 1.51} + + 50%|█████ | 2260/4476 [14:39:39<14:32:01, 23.61s/it] + 51%|█████ | 2261/4476 [14:40:03<14:30:08, 23.57s/it] + 51%|█████ | 2262/4476 [14:40:26<14:26:00, 23.47s/it] + 51%|█████ | 2263/4476 [14:40:50<14:28:53, 23.56s/it] + 51%|█████ | 2264/4476 [14:41:13<14:25:26, 23.47s/it] + 51%|█████ | 2265/4476 [14:41:36<14:23:02, 23.42s/it] + 51%|█████ | 2266/4476 [14:42:00<14:18:55, 23.32s/it] + 51%|█████ | 2267/4476 [14:42:23<14:19:16, 23.34s/it] + 51%|█████ | 2268/4476 [14:42:46<14:18:33, 23.33s/it] + 51%|█████ | 2269/4476 [14:43:10<14:20:13, 23.39s/it] + 51%|█████ | 2270/4476 [14:43:33<14:16:37, 23.30s/it] + +{'loss': 0.394, 'learning_rate': 2.445608976674939e-05, 'epoch': 1.52} + + 51%|█████ | 2270/4476 [14:43:33<14:16:37, 23.30s/it] + 51%|█████ | 2271/4476 [14:43:57<14:20:35, 23.42s/it] + 51%|█████ | 2272/4476 [14:44:20<14:21:45, 23.46s/it] + 51%|█████ | 2273/4476 [14:44:43<14:19:24, 23.41s/it] + 51%|█████ | 2274/4476 [14:45:07<14:18:40, 23.40s/it] + 51%|█████ | 2275/4476 [14:45:30<14:18:37, 23.41s/it] + 51%|█████ | 2276/4476 [14:45:53<14:15:46, 23.34s/it] + 51%|█████ | 2277/4476 [14:46:17<14:15:40, 23.35s/it] + 51%|█████ | 2278/4476 [14:46:40<14:13:53, 23.31s/it] + 51%|█████ | 2279/4476 [14:47:03<14:12:47, 23.29s/it] + 51%|█████ | 2280/4476 [14:47:26<14:12:21, 23.29s/it] + +{'loss': 0.3866, 'learning_rate': 2.4280677378963906e-05, 'epoch': 1.53} + + 51%|█████ | 2280/4476 [14:47:26<14:12:21, 23.29s/it] + 51%|█████ | 2281/4476 [14:47:50<14:12:00, 23.29s/it] + 51%|█████ | 2282/4476 [14:48:13<14:11:09, 23.28s/it] + 51%|█████ | 2283/4476 [14:48:36<14:10:56, 23.28s/it] + 51%|█████ | 2284/4476 [14:49:00<14:12:24, 23.33s/it] + 51%|█████ | 2285/4476 [14:49:23<14:11:28, 23.32s/it] + 51%|█████ | 2286/4476 [14:49:46<14:12:28, 23.36s/it] + 51%|█████ | 2287/4476 [14:50:10<14:11:21, 23.34s/it] + 51%|█████ | 2288/4476 [14:50:33<14:07:44, 23.25s/it] + 51%|█████ | 2289/4476 [14:50:56<14:07:01, 23.24s/it] + 51%|█████ | 2290/4476 [14:51:20<14:11:52, 23.38s/it] + +{'loss': 0.3879, 'learning_rate': 2.410530042691992e-05, 'epoch': 1.53} + + 51%|█████ | 2290/4476 [14:51:20<14:11:52, 23.38s/it] + 51%|█████ | 2291/4476 [14:51:43<14:08:39, 23.30s/it] + 51%|█████ | 2292/4476 [14:52:06<14:09:17, 23.33s/it] + 51%|█████ | 2293/4476 [14:52:30<14:08:40, 23.33s/it] + 51%|█████▏ | 2294/4476 [14:52:53<14:07:17, 23.30s/it] + 51%|█████▏ | 2295/4476 [14:53:16<14:08:30, 23.34s/it] + 51%|█████▏ | 2296/4476 [14:53:40<14:08:38, 23.36s/it] + 51%|█████▏ | 2297/4476 [14:54:03<14:07:09, 23.33s/it] + 51%|█████▏ | 2298/4476 [14:54:26<14:04:35, 23.27s/it] + 51%|█████▏ | 2299/4476 [14:54:49<14:04:06, 23.26s/it] + 51%|█████▏ | 2300/4476 [14:55:12<14:02:49, 23.24s/it] + +{'loss': 0.388, 'learning_rate': 2.3929967550151568e-05, 'epoch': 1.54} + + 51%|█████▏ | 2300/4476 [14:55:12<14:02:49, 23.24s/it] + 51%|█████▏ | 2301/4476 [14:55:36<14:02:32, 23.24s/it] + 51%|█████▏ | 2302/4476 [14:55:59<13:58:31, 23.14s/it] + 51%|█████▏ 
| 2303/4476 [14:56:22<14:00:33, 23.21s/it] + 51%|█████▏ | 2304/4476 [14:56:46<14:03:32, 23.30s/it] + 51%|█████▏ | 2305/4476 [14:57:09<14:02:38, 23.29s/it] + 52%|█████▏ | 2306/4476 [14:57:32<14:03:40, 23.33s/it] + 52%|█████▏ | 2307/4476 [14:57:56<14:04:29, 23.36s/it] + 52%|█████▏ | 2308/4476 [14:58:18<13:57:18, 23.17s/it] + 52%|█████▏ | 2309/4476 [14:58:41<13:56:00, 23.15s/it] + 52%|█████▏ | 2310/4476 [14:59:05<14:01:02, 23.30s/it] + +{'loss': 0.3868, 'learning_rate': 2.375468738602171e-05, 'epoch': 1.55} + + 52%|█████▏ | 2310/4476 [14:59:05<14:01:02, 23.30s/it] + 52%|█████▏ | 2311/4476 [14:59:28<14:00:25, 23.29s/it] + 52%|█████▏ | 2312/4476 [14:59:52<14:02:39, 23.36s/it] + 52%|█████▏ | 2313/4476 [15:00:15<13:58:27, 23.26s/it] + 52%|█████▏ | 2314/4476 [15:00:38<13:57:14, 23.24s/it] + 52%|█████▏ | 2315/4476 [15:01:02<13:59:28, 23.31s/it] + 52%|█████▏ | 2316/4476 [15:01:25<14:00:25, 23.35s/it] + 52%|█████▏ | 2317/4476 [15:01:48<13:59:40, 23.34s/it] + 52%|█████▏ | 2318/4476 [15:02:12<13:59:14, 23.33s/it] + 52%|█████▏ | 2319/4476 [15:02:35<13:55:46, 23.25s/it] + 52%|█████▏ | 2320/4476 [15:02:58<13:56:00, 23.27s/it] + +{'loss': 0.3769, 'learning_rate': 2.3579468569296464e-05, 'epoch': 1.55} + + 52%|█████▏ | 2320/4476 [15:02:58<13:56:00, 23.27s/it] + 52%|█████▏ | 2321/4476 [15:03:21<13:55:17, 23.26s/it] + 52%|█████▏ | 2322/4476 [15:03:45<13:58:07, 23.35s/it] + 52%|█████▏ | 2323/4476 [15:04:08<14:00:41, 23.43s/it] + 52%|█████▏ | 2324/4476 [15:04:32<13:59:15, 23.40s/it] + 52%|█████▏ | 2325/4476 [15:04:55<14:02:46, 23.51s/it] + 52%|█████▏ | 2326/4476 [15:05:19<14:00:02, 23.44s/it] + 52%|█████▏ | 2327/4476 [15:05:42<13:57:58, 23.40s/it] + 52%|█████▏ | 2328/4476 [15:06:06<13:59:16, 23.44s/it] + 52%|█████▏ | 2329/4476 [15:06:29<13:57:30, 23.40s/it] + 52%|█████▏ | 2330/4476 [15:06:52<13:57:27, 23.41s/it] + +{'loss': 0.3811, 'learning_rate': 2.340431973171978e-05, 'epoch': 1.56} + + 52%|█████▏ | 2330/4476 [15:06:52<13:57:27, 23.41s/it] + 52%|█████▏ | 2331/4476 [15:07:16<13:55:38, 23.37s/it] + 52%|█████▏ | 2332/4476 [15:07:39<13:54:31, 23.35s/it] + 52%|█████▏ | 2333/4476 [15:08:02<13:53:46, 23.34s/it] + 52%|█████▏ | 2334/4476 [15:08:25<13:49:39, 23.24s/it] + 52%|█████▏ | 2335/4476 [15:08:49<13:52:14, 23.32s/it] + 52%|█████▏ | 2336/4476 [15:09:12<13:52:34, 23.34s/it] + 52%|█████▏ | 2337/4476 [15:09:35<13:51:41, 23.33s/it] + 52%|█████▏ | 2338/4476 [15:09:59<13:51:05, 23.32s/it] + 52%|█████▏ | 2339/4476 [15:10:22<13:50:22, 23.31s/it] + 52%|█████▏ | 2340/4476 [15:10:45<13:49:56, 23.31s/it] + +{'loss': 0.3805, 'learning_rate': 2.3229249501588278e-05, 'epoch': 1.57} + + 52%|█████▏ | 2340/4476 [15:10:45<13:49:56, 23.31s/it] + 52%|█████▏ | 2341/4476 [15:11:09<13:51:22, 23.36s/it] + 52%|█████▏ | 2342/4476 [15:11:33<13:54:34, 23.46s/it] + 52%|█████▏ | 2343/4476 [15:11:56<13:54:49, 23.48s/it] + 52%|█████▏ | 2344/4476 [15:12:20<13:54:04, 23.47s/it] + 52%|█████▏ | 2345/4476 [15:12:43<13:51:39, 23.42s/it] + 52%|█████▏ | 2346/4476 [15:13:06<13:45:32, 23.25s/it] + 52%|█████▏ | 2347/4476 [15:13:29<13:45:18, 23.26s/it] + 52%|█████▏ | 2348/4476 [15:13:52<13:46:06, 23.29s/it] + 52%|█████▏ | 2349/4476 [15:14:16<13:44:56, 23.27s/it] + 53%|█████▎ | 2350/4476 [15:14:39<13:46:06, 23.31s/it] + +{'loss': 0.3822, 'learning_rate': 2.3054266503326165e-05, 'epoch': 1.57} + + 53%|█████▎ | 2350/4476 [15:14:39<13:46:06, 23.31s/it] + 53%|█████▎ | 2351/4476 [15:15:02<13:46:02, 23.32s/it] + 53%|█████▎ | 2352/4476 [15:15:26<13:47:24, 23.37s/it] + 53%|█████▎ | 2353/4476 [15:15:49<13:46:51, 23.37s/it] + 53%|█████▎ | 2354/4476 
[15:16:12<13:43:00, 23.27s/it] + 53%|█████▎ | 2355/4476 [15:16:36<13:43:53, 23.31s/it] + 53%|█████▎ | 2356/4476 [15:16:59<13:42:35, 23.28s/it] + 53%|█████▎ | 2357/4476 [15:17:22<13:43:18, 23.31s/it] + 53%|█████▎ | 2358/4476 [15:17:46<13:42:58, 23.31s/it] + 53%|█████▎ | 2359/4476 [15:18:09<13:42:53, 23.32s/it] + 53%|█████▎ | 2360/4476 [15:18:32<13:44:29, 23.38s/it] + +{'loss': 0.3875, 'learning_rate': 2.2879379357060345e-05, 'epoch': 1.58} + + 53%|█████▎ | 2360/4476 [15:18:32<13:44:29, 23.38s/it] + 53%|█████▎ | 2361/4476 [15:18:56<13:43:29, 23.36s/it] + 53%|█████▎ | 2362/4476 [15:19:19<13:38:55, 23.24s/it] + 53%|█████▎ | 2363/4476 [15:19:42<13:41:34, 23.33s/it] + 53%|█████▎ | 2364/4476 [15:20:06<13:41:54, 23.35s/it] + 53%|█████▎ | 2365/4476 [15:20:29<13:41:18, 23.34s/it] + 53%|█████▎ | 2366/4476 [15:20:53<13:45:04, 23.46s/it] + 53%|█████▎ | 2367/4476 [15:21:16<13:42:11, 23.39s/it] + 53%|█████▎ | 2368/4476 [15:21:39<13:37:13, 23.26s/it] + 53%|█████▎ | 2369/4476 [15:22:02<13:38:09, 23.30s/it] + 53%|█████▎ | 2370/4476 [15:22:26<13:38:00, 23.31s/it] + +{'loss': 0.3884, 'learning_rate': 2.2704596678195827e-05, 'epoch': 1.59} + + 53%|█████▎ | 2370/4476 [15:22:26<13:38:00, 23.31s/it] + 53%|█████▎ | 2371/4476 [15:22:49<13:41:45, 23.42s/it] + 53%|█████▎ | 2372/4476 [15:23:13<13:40:04, 23.39s/it] + 53%|█████▎ | 2373/4476 [15:23:36<13:41:13, 23.43s/it] + 53%|█████▎ | 2374/4476 [15:24:00<13:42:08, 23.47s/it] + 53%|█████▎ | 2375/4476 [15:24:23<13:42:14, 23.48s/it] + 53%|█████▎ | 2376/4476 [15:24:46<13:39:10, 23.41s/it] + 53%|█████▎ | 2377/4476 [15:25:10<13:38:28, 23.40s/it] + 53%|█████▎ | 2378/4476 [15:25:33<13:39:24, 23.43s/it] + 53%|█████▎ | 2379/4476 [15:25:56<13:34:04, 23.29s/it] + 53%|█████▎ | 2380/4476 [15:26:20<13:33:07, 23.28s/it] + +{'loss': 0.381, 'learning_rate': 2.2529927076991283e-05, 'epoch': 1.59} + + 53%|█████▎ | 2380/4476 [15:26:20<13:33:07, 23.28s/it] + 53%|█████▎ | 2381/4476 [15:26:43<13:33:13, 23.29s/it] + 53%|█████▎ | 2382/4476 [15:27:06<13:29:29, 23.19s/it] + 53%|█████▎ | 2383/4476 [15:27:29<13:29:24, 23.20s/it] + 53%|█████▎ | 2384/4476 [15:27:52<13:28:33, 23.19s/it] + 53%|█████▎ | 2385/4476 [15:28:16<13:29:34, 23.23s/it] + 53%|█████▎ | 2386/4476 [15:28:39<13:27:59, 23.20s/it] + 53%|█████▎ | 2387/4476 [15:29:02<13:29:05, 23.24s/it] + 53%|█████▎ | 2388/4476 [15:29:25<13:30:16, 23.28s/it] + 53%|█████▎ | 2389/4476 [15:29:48<13:27:24, 23.21s/it] + 53%|█████▎ | 2390/4476 [15:30:12<13:28:22, 23.25s/it] + +{'loss': 0.3933, 'learning_rate': 2.2355379158134843e-05, 'epoch': 1.6} + + 53%|█████▎ | 2390/4476 [15:30:12<13:28:22, 23.25s/it] + 53%|█████▎ | 2391/4476 [15:30:35<13:25:17, 23.17s/it] + 53%|█████▎ | 2392/4476 [15:30:58<13:20:58, 23.06s/it] + 53%|█████▎ | 2393/4476 [15:31:21<13:21:39, 23.09s/it] + 53%|█████▎ | 2394/4476 [15:31:44<13:22:47, 23.14s/it] + 54%|█████▎ | 2395/4476 [15:32:07<13:22:40, 23.14s/it] + 54%|█████▎ | 2396/4476 [15:32:30<13:24:54, 23.22s/it] + 54%|█████▎ | 2397/4476 [15:32:54<13:26:36, 23.28s/it] + 54%|█████▎ | 2398/4476 [15:33:17<13:28:09, 23.33s/it] + 54%|█████▎ | 2399/4476 [15:33:41<13:27:24, 23.32s/it] + 54%|█████▎ | 2400/4476 [15:34:04<13:27:13, 23.33s/it] + +{'loss': 0.3853, 'learning_rate': 2.2180961520320278e-05, 'epoch': 1.61} + + 54%|█████▎ | 2400/4476 [15:34:04<13:27:13, 23.33s/it][INFO|trainer.py:2939] 2023-11-12 18:57:51,743 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2400 +[INFO|tokenization_utils_base.py:2437] 2023-11-12 18:57:51,774 >> tokenizer config file saved in 
/home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2400/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-11-12 18:57:51,774 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2400/special_tokens_map.json +[INFO|tokenization_utils_base.py:2496] 2023-11-12 18:57:51,774 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2400/added_tokens.json + + 54%|█████▎ | 2401/4476 [15:34:27<13:24:41, 23.27s/it] + 54%|█████▎ | 2402/4476 [15:34:51<13:25:54, 23.31s/it] + 54%|█████▎ | 2403/4476 [15:35:14<13:27:17, 23.37s/it] + 54%|█████▎ | 2404/4476 [15:35:37<13:25:23, 23.32s/it] + 54%|█████▎ | 2405/4476 [15:36:00<13:20:45, 23.20s/it] + 54%|█████▍ | 2406/4476 [15:36:24<13:22:16, 23.25s/it] + 54%|█████▍ | 2407/4476 [15:36:47<13:26:15, 23.38s/it] + 54%|█████▍ | 2408/4476 [15:37:11<13:25:00, 23.36s/it] + 54%|█████▍ | 2409/4476 [15:37:34<13:25:43, 23.39s/it] + 54%|█████▍ | 2410/4476 [15:37:58<13:29:00, 23.49s/it] + +{'loss': 0.3871, 'learning_rate': 2.2006682755823367e-05, 'epoch': 1.62} + + 54%|█████▍ | 2410/4476 [15:37:58<13:29:00, 23.49s/it] + 54%|█████▍ | 2411/4476 [15:38:21<13:28:26, 23.49s/it] + 54%|█████▍ | 2412/4476 [15:38:44<13:25:12, 23.41s/it] + 54%|█████▍ | 2413/4476 [15:39:08<13:26:03, 23.44s/it] + 54%|█████▍ | 2414/4476 [15:39:31<13:23:11, 23.37s/it] + 54%|█████▍ | 2415/4476 [15:39:55<13:27:19, 23.50s/it] + 54%|█████▍ | 2416/4476 [15:40:18<13:22:33, 23.38s/it] + 54%|█████▍ | 2417/4476 [15:40:41<13:20:20, 23.32s/it] + 54%|█████▍ | 2418/4476 [15:41:05<13:24:37, 23.46s/it] + 54%|█████▍ | 2419/4476 [15:41:28<13:19:50, 23.33s/it] + 54%|█████▍ | 2420/4476 [15:41:51<13:18:30, 23.30s/it] + +{'loss': 0.3867, 'learning_rate': 2.1832551450078594e-05, 'epoch': 1.62} + + 54%|█████▍ | 2420/4476 [15:41:51<13:18:30, 23.30s/it] + 54%|█████▍ | 2421/4476 [15:42:14<13:16:35, 23.26s/it] + 54%|█████▍ | 2422/4476 [15:42:38<13:19:07, 23.34s/it] + 54%|█████▍ | 2423/4476 [15:43:02<13:23:14, 23.48s/it] + 54%|█████▍ | 2424/4476 [15:43:25<13:19:10, 23.37s/it] + 54%|█████▍ | 2425/4476 [15:43:48<13:17:43, 23.34s/it] + 54%|█████▍ | 2426/4476 [15:44:11<13:13:32, 23.23s/it] + 54%|█████▍ | 2427/4476 [15:44:34<13:13:24, 23.23s/it] + 54%|█████▍ | 2428/4476 [15:44:58<13:15:03, 23.29s/it] + 54%|█████▍ | 2429/4476 [15:45:21<13:18:26, 23.40s/it] + 54%|█████▍ | 2430/4476 [15:45:45<13:14:30, 23.30s/it] + +{'loss': 0.3878, 'learning_rate': 2.165857618125625e-05, 'epoch': 1.63} + + 54%|█████▍ | 2430/4476 [15:45:45<13:14:30, 23.30s/it] + 54%|█████▍ | 2431/4476 [15:46:08<13:11:51, 23.23s/it] + 54%|█████▍ | 2432/4476 [15:46:31<13:10:37, 23.21s/it] + 54%|█████▍ | 2433/4476 [15:46:54<13:12:55, 23.29s/it] + 54%|█████▍ | 2434/4476 [15:47:18<13:14:49, 23.35s/it] + 54%|█████▍ | 2435/4476 [15:47:41<13:14:47, 23.36s/it] + 54%|█████▍ | 2436/4476 [15:48:05<13:15:12, 23.39s/it] + 54%|█████▍ | 2437/4476 [15:48:28<13:11:16, 23.28s/it] + 54%|█████▍ | 2438/4476 [15:48:51<13:13:32, 23.36s/it] + 54%|█████▍ | 2439/4476 [15:49:15<13:16:58, 23.47s/it] + 55%|█████▍ | 2440/4476 [15:49:38<13:17:02, 23.49s/it] + +{'loss': 0.386, 'learning_rate': 2.1484765519839843e-05, 'epoch': 1.64} + + 55%|█████▍ | 2440/4476 [15:49:38<13:17:02, 23.49s/it] + 55%|█████▍ | 2441/4476 [15:50:02<13:14:49, 23.43s/it] + 55%|█████▍ | 2442/4476 [15:50:25<13:09:52, 23.30s/it] + 55%|█████▍ | 2443/4476 [15:50:48<13:09:15, 23.29s/it] + 55%|█████▍ | 2444/4476 [15:51:12<13:14:35, 23.46s/it] + 55%|█████▍ | 2445/4476 [15:51:35<13:15:02, 23.49s/it] + 55%|█████▍ | 
2446/4476 [15:51:59<13:17:10, 23.56s/it] + 55%|█████▍ | 2447/4476 [15:52:22<13:13:31, 23.47s/it] + 55%|█████▍ | 2448/4476 [15:52:46<13:11:32, 23.42s/it] + 55%|█████▍ | 2449/4476 [15:53:09<13:11:20, 23.42s/it] + 55%|█████▍ | 2450/4476 [15:53:33<13:11:01, 23.43s/it] + +{'loss': 0.3836, 'learning_rate': 2.1311128028203863e-05, 'epoch': 1.64} + + 55%|█████▍ | 2450/4476 [15:53:33<13:11:01, 23.43s/it] + 55%|█████▍ | 2451/4476 [15:53:56<13:11:59, 23.47s/it] + 55%|█████▍ | 2452/4476 [15:54:20<13:14:27, 23.55s/it] + 55%|█████▍ | 2453/4476 [15:54:43<13:07:49, 23.37s/it] + 55%|█████▍ | 2454/4476 [15:55:06<13:05:51, 23.32s/it] + 55%|█████▍ | 2455/4476 [15:55:30<13:07:37, 23.38s/it] + 55%|█████▍ | 2456/4476 [15:55:53<13:10:52, 23.49s/it] + 55%|█████▍ | 2457/4476 [15:56:17<13:08:11, 23.42s/it] + 55%|█████▍ | 2458/4476 [15:56:40<13:06:10, 23.37s/it] + 55%|█████▍ | 2459/4476 [15:57:03<13:08:32, 23.46s/it] + 55%|█████▍ | 2460/4476 [15:57:27<13:06:49, 23.42s/it] + +{'loss': 0.3849, 'learning_rate': 2.1137672260192004e-05, 'epoch': 1.65} + + 55%|█████▍ | 2460/4476 [15:57:27<13:06:49, 23.42s/it] + 55%|█████▍ | 2461/4476 [15:57:50<13:07:40, 23.45s/it] + 55%|█████▌ | 2462/4476 [15:58:14<13:09:30, 23.52s/it] + 55%|█████▌ | 2463/4476 [15:58:37<13:04:49, 23.39s/it] + 55%|█████▌ | 2464/4476 [15:59:00<13:03:16, 23.36s/it] + 55%|█████▌ | 2465/4476 [15:59:24<13:03:56, 23.39s/it] + 55%|█████▌ | 2466/4476 [15:59:47<13:05:26, 23.45s/it] + 55%|█████▌ | 2467/4476 [16:00:11<13:05:59, 23.47s/it] + 55%|█████▌ | 2468/4476 [16:00:35<13:08:21, 23.56s/it] + 55%|█████▌ | 2469/4476 [16:00:58<13:07:48, 23.55s/it] + 55%|█████▌ | 2470/4476 [16:01:22<13:10:03, 23.63s/it] + +{'loss': 0.3841, 'learning_rate': 2.09644067606958e-05, 'epoch': 1.66} + + 55%|█████▌ | 2470/4476 [16:01:22<13:10:03, 23.63s/it] + 55%|█████▌ | 2471/4476 [16:01:45<13:05:56, 23.52s/it] + 55%|█████▌ | 2472/4476 [16:02:08<13:00:55, 23.38s/it] + 55%|█████▌ | 2473/4476 [16:02:32<13:03:30, 23.47s/it] + 55%|█████▌ | 2474/4476 [16:02:56<13:05:39, 23.55s/it] + 55%|█████▌ | 2475/4476 [16:03:19<13:04:25, 23.52s/it] + 55%|█████▌ | 2476/4476 [16:03:43<13:02:30, 23.48s/it] + 55%|█████▌ | 2477/4476 [16:04:06<13:00:36, 23.43s/it] + 55%|█████▌ | 2478/4476 [16:04:29<13:00:12, 23.43s/it] + 55%|█████▌ | 2479/4476 [16:04:53<12:59:21, 23.42s/it] + 55%|█████▌ | 2480/4476 [16:05:16<13:00:11, 23.45s/it] + +{'loss': 0.3848, 'learning_rate': 2.079134006523359e-05, 'epoch': 1.66} + + 55%|█████▌ | 2480/4476 [16:05:16<13:00:11, 23.45s/it] + 55%|█████▌ | 2481/4476 [16:05:40<13:02:13, 23.53s/it] + 55%|█████▌ | 2482/4476 [16:06:03<12:57:11, 23.39s/it] + 55%|█████▌ | 2483/4476 [16:06:26<12:57:12, 23.40s/it] + 55%|█████▌ | 2484/4476 [16:06:50<12:57:50, 23.43s/it] + 56%|█████▌ | 2485/4476 [16:07:13<12:55:40, 23.38s/it] + 56%|█████▌ | 2486/4476 [16:07:37<12:56:25, 23.41s/it] + 56%|█████▌ | 2487/4476 [16:08:00<12:53:19, 23.33s/it] + 56%|█████▌ | 2488/4476 [16:08:23<12:50:03, 23.24s/it] + 56%|█████▌ | 2489/4476 [16:08:47<12:54:29, 23.39s/it] + 56%|█████▌ | 2490/4476 [16:09:10<12:52:13, 23.33s/it] + +{'loss': 0.3817, 'learning_rate': 2.061848069953017e-05, 'epoch': 1.67} + + 56%|█████▌ | 2490/4476 [16:09:10<12:52:13, 23.33s/it] + 56%|█████▌ | 2491/4476 [16:09:34<12:56:20, 23.47s/it] + 56%|█████▌ | 2492/4476 [16:09:57<12:52:28, 23.36s/it] + 56%|█████▌ | 2493/4476 [16:10:20<12:48:26, 23.25s/it] + 56%|█████▌ | 2494/4476 [16:10:43<12:50:04, 23.31s/it] + 56%|█████▌ | 2495/4476 [16:11:06<12:49:58, 23.32s/it] + 56%|█████▌ | 2496/4476 [16:11:30<12:50:51, 23.36s/it] + 56%|█████▌ | 2497/4476 [16:11:53<12:50:36, 
23.36s/it] + 56%|█████▌ | 2498/4476 [16:12:17<12:52:51, 23.44s/it] + 56%|█████▌ | 2499/4476 [16:12:40<12:49:20, 23.35s/it] + 56%|█████▌ | 2500/4476 [16:13:04<12:51:09, 23.42s/it] + +{'loss': 0.3839, 'learning_rate': 2.044583717909667e-05, 'epoch': 1.68} + + 56%|█████▌ | 2500/4476 [16:13:04<12:51:09, 23.42s/it] + 56%|█████▌ | 2501/4476 [16:13:27<12:48:56, 23.36s/it] + 56%|█████▌ | 2502/4476 [16:13:50<12:45:44, 23.27s/it] + 56%|█████▌ | 2503/4476 [16:14:13<12:45:20, 23.27s/it] + 56%|█████▌ | 2504/4476 [16:14:36<12:41:17, 23.16s/it] + 56%|█████▌ | 2505/4476 [16:14:59<12:42:38, 23.22s/it] + 56%|█████▌ | 2506/4476 [16:15:23<12:41:42, 23.20s/it] + 56%|█████▌ | 2507/4476 [16:15:46<12:44:20, 23.29s/it] + 56%|█████▌ | 2508/4476 [16:16:09<12:44:00, 23.29s/it] + 56%|█████▌ | 2509/4476 [16:16:33<12:41:52, 23.24s/it] + 56%|█████▌ | 2510/4476 [16:16:56<12:43:28, 23.30s/it] + +{'loss': 0.3749, 'learning_rate': 2.0273418008811125e-05, 'epoch': 1.68} + + 56%|█████▌ | 2510/4476 [16:16:56<12:43:28, 23.30s/it] + 56%|█████▌ | 2511/4476 [16:17:19<12:42:45, 23.29s/it] + 56%|█████▌ | 2512/4476 [16:17:43<12:42:34, 23.30s/it] + 56%|█████▌ | 2513/4476 [16:18:06<12:41:41, 23.28s/it] + 56%|█████▌ | 2514/4476 [16:18:29<12:39:52, 23.24s/it] + 56%|█████▌ | 2515/4476 [16:18:52<12:37:34, 23.18s/it] + 56%|█████▌ | 2516/4476 [16:19:15<12:38:53, 23.23s/it] + 56%|█████▌ | 2517/4476 [16:19:39<12:39:07, 23.25s/it] + 56%|█████▋ | 2518/4476 [16:20:02<12:38:48, 23.25s/it] + 56%|█████▋ | 2519/4476 [16:20:26<12:43:43, 23.42s/it] + 56%|█████▋ | 2520/4476 [16:20:49<12:42:08, 23.38s/it] + +{'loss': 0.3824, 'learning_rate': 2.0101231682499506e-05, 'epoch': 1.69} + + 56%|█████▋ | 2520/4476 [16:20:49<12:42:08, 23.38s/it] + 56%|█████▋ | 2521/4476 [16:21:12<12:42:28, 23.40s/it] + 56%|█████▋ | 2522/4476 [16:21:36<12:42:08, 23.40s/it] + 56%|█████▋ | 2523/4476 [16:21:59<12:40:35, 23.37s/it] + 56%|█████▋ | 2524/4476 [16:22:22<12:39:17, 23.34s/it] + 56%|█████▋ | 2525/4476 [16:22:46<12:37:53, 23.31s/it] + 56%|█████▋ | 2526/4476 [16:23:09<12:41:50, 23.44s/it] + 56%|█████▋ | 2527/4476 [16:23:33<12:39:32, 23.38s/it] + 56%|█████▋ | 2528/4476 [16:23:56<12:36:36, 23.30s/it] + 57%|█████▋ | 2529/4476 [16:24:19<12:36:27, 23.31s/it] + 57%|█████▋ | 2530/4476 [16:24:42<12:36:26, 23.32s/it] + +{'loss': 0.3854, 'learning_rate': 1.9929286682517235e-05, 'epoch': 1.7} + + 57%|█████▋ | 2530/4476 [16:24:42<12:36:26, 23.32s/it] + 57%|█████▋ | 2531/4476 [16:25:06<12:36:29, 23.34s/it] + 57%|█████▋ | 2532/4476 [16:25:29<12:38:15, 23.40s/it] + 57%|█████▋ | 2533/4476 [16:25:53<12:35:38, 23.33s/it] + 57%|█████▋ | 2534/4476 [16:26:16<12:36:24, 23.37s/it] + 57%|█████▋ | 2535/4476 [16:26:39<12:36:38, 23.39s/it] + 57%|█████▋ | 2536/4476 [16:27:03<12:37:25, 23.43s/it] + 57%|█████▋ | 2537/4476 [16:27:26<12:37:24, 23.44s/it] + 57%|█████▋ | 2538/4476 [16:27:50<12:35:40, 23.40s/it] + 57%|█████▋ | 2539/4476 [16:28:13<12:36:10, 23.42s/it] + 57%|█████▋ | 2540/4476 [16:28:37<12:36:05, 23.43s/it] + +{'loss': 0.3894, 'learning_rate': 1.9757591479331382e-05, 'epoch': 1.7} + + 57%|█████▋ | 2540/4476 [16:28:37<12:36:05, 23.43s/it] + 57%|█████▋ | 2541/4476 [16:29:00<12:34:59, 23.41s/it] + 57%|█████▋ | 2542/4476 [16:29:24<12:36:02, 23.46s/it] + 57%|█████▋ | 2543/4476 [16:29:47<12:33:53, 23.40s/it] + 57%|█████▋ | 2544/4476 [16:30:10<12:33:15, 23.39s/it] + 57%|█████▋ | 2545/4476 [16:30:33<12:30:37, 23.32s/it] + 57%|█████▋ | 2546/4476 [16:30:57<12:29:31, 23.30s/it] + 57%|█████▋ | 2547/4476 [16:31:20<12:29:40, 23.32s/it] + 57%|█████▋ | 2548/4476 [16:31:44<12:33:18, 23.44s/it] + 57%|█████▋ | 
2549/4476 [16:32:07<12:33:49, 23.47s/it] + 57%|█████▋ | 2550/4476 [16:32:31<12:31:46, 23.42s/it] + +{'loss': 0.3871, 'learning_rate': 1.9586154531103373e-05, 'epoch': 1.71} + + 57%|█████▋ | 2550/4476 [16:32:31<12:31:46, 23.42s/it] + 57%|█████▋ | 2551/4476 [16:32:54<12:28:53, 23.34s/it] + 57%|█████▋ | 2552/4476 [16:33:17<12:26:47, 23.29s/it] + 57%|█████▋ | 2553/4476 [16:33:40<12:25:15, 23.25s/it] + 57%|█████▋ | 2554/4476 [16:34:03<12:25:20, 23.27s/it] + 57%|█████▋ | 2555/4476 [16:34:27<12:26:16, 23.31s/it] + 57%|█████▋ | 2556/4476 [16:34:50<12:27:47, 23.37s/it] + 57%|█████▋ | 2557/4476 [16:35:14<12:29:11, 23.42s/it] + 57%|█████▋ | 2558/4476 [16:35:37<12:23:41, 23.26s/it] + 57%|█████▋ | 2559/4476 [16:36:00<12:24:19, 23.30s/it] + 57%|█████▋ | 2560/4476 [16:36:23<12:22:37, 23.26s/it] + +{'loss': 0.382, 'learning_rate': 1.9414984283272286e-05, 'epoch': 1.72} + + 57%|█████▋ | 2560/4476 [16:36:23<12:22:37, 23.26s/it] + 57%|█████▋ | 2561/4476 [16:36:46<12:20:31, 23.20s/it] + 57%|█████▋ | 2562/4476 [16:37:10<12:20:24, 23.21s/it] + 57%|█████▋ | 2563/4476 [16:37:33<12:20:54, 23.24s/it] + 57%|█████▋ | 2564/4476 [16:37:56<12:17:12, 23.13s/it] + 57%|█████▋ | 2565/4476 [16:38:19<12:16:52, 23.14s/it] + 57%|█████▋ | 2566/4476 [16:38:42<12:20:25, 23.26s/it] + 57%|█████▋ | 2567/4476 [16:39:06<12:19:46, 23.25s/it] + 57%|█████▋ | 2568/4476 [16:39:29<12:19:49, 23.26s/it] + 57%|█████▋ | 2569/4476 [16:39:52<12:19:23, 23.26s/it] + 57%|█████▋ | 2570/4476 [16:40:16<12:22:24, 23.37s/it] + +{'loss': 0.3858, 'learning_rate': 1.9244089168138836e-05, 'epoch': 1.72} + + 57%|█████▋ | 2570/4476 [16:40:16<12:22:24, 23.37s/it] + 57%|█████▋ | 2571/4476 [16:40:39<12:18:34, 23.26s/it] + 57%|█████▋ | 2572/4476 [16:41:02<12:18:41, 23.28s/it] + 57%|█████▋ | 2573/4476 [16:41:25<12:18:27, 23.28s/it] + 58%|█████▊ | 2574/4476 [16:41:49<12:18:10, 23.29s/it] + 58%|█████▊ | 2575/4476 [16:42:12<12:19:49, 23.35s/it] + 58%|█████▊ | 2576/4476 [16:42:35<12:16:38, 23.26s/it] + 58%|█████▊ | 2577/4476 [16:42:59<12:17:44, 23.31s/it] + 58%|█████▊ | 2578/4476 [16:43:22<12:18:58, 23.36s/it] + 58%|█████▊ | 2579/4476 [16:43:46<12:20:30, 23.42s/it] + 58%|█████▊ | 2580/4476 [16:44:09<12:16:57, 23.32s/it] + +{'loss': 0.3819, 'learning_rate': 1.9073477604449985e-05, 'epoch': 1.73} + + 58%|█████▊ | 2580/4476 [16:44:09<12:16:57, 23.32s/it] + 58%|█████▊ | 2581/4476 [16:44:32<12:17:55, 23.36s/it] + 58%|█████▊ | 2582/4476 [16:44:56<12:20:27, 23.46s/it] + 58%|█████▊ | 2583/4476 [16:45:19<12:20:20, 23.47s/it] + 58%|█████▊ | 2584/4476 [16:45:43<12:18:26, 23.42s/it] + 58%|█████▊ | 2585/4476 [16:46:06<12:16:42, 23.37s/it] + 58%|█████▊ | 2586/4476 [16:46:29<12:15:47, 23.36s/it] + 58%|█████▊ | 2587/4476 [16:46:52<12:13:19, 23.29s/it] + 58%|█████▊ | 2588/4476 [16:47:16<12:16:22, 23.40s/it] + 58%|█████▊ | 2589/4476 [16:47:40<12:19:35, 23.52s/it] + 58%|█████▊ | 2590/4476 [16:48:03<12:16:57, 23.45s/it] + +{'loss': 0.3795, 'learning_rate': 1.8903157996984174e-05, 'epoch': 1.74} + + 58%|█████▊ | 2590/4476 [16:48:03<12:16:57, 23.45s/it] + 58%|█████▊ | 2591/4476 [16:48:26<12:14:34, 23.38s/it] + 58%|█████▊ | 2592/4476 [16:48:50<12:13:19, 23.35s/it] + 58%|█████▊ | 2593/4476 [16:49:13<12:11:32, 23.31s/it] + 58%|█████▊ | 2594/4476 [16:49:36<12:10:31, 23.29s/it] + 58%|█████▊ | 2595/4476 [16:50:00<12:11:05, 23.32s/it] + 58%|█████▊ | 2596/4476 [16:50:23<12:10:38, 23.32s/it] + 58%|█████▊ | 2597/4476 [16:50:46<12:12:33, 23.39s/it] + 58%|█████▊ | 2598/4476 [16:51:10<12:15:47, 23.51s/it] + 58%|█████▊ | 2599/4476 [16:51:34<12:17:16, 23.57s/it] + 58%|█████▊ | 2600/4476 [16:51:57<12:16:16, 
23.55s/it] + +{'loss': 0.3827, 'learning_rate': 1.873313873613733e-05, 'epoch': 1.74} + + 58%|█████▊ | 2600/4476 [16:51:57<12:16:16, 23.55s/it][INFO|trainer.py:2939] 2023-11-12 20:15:45,179 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2600 +[INFO|tokenization_utils_base.py:2437] 2023-11-12 20:15:45,210 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2600/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-11-12 20:15:45,210 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2600/special_tokens_map.json +[INFO|tokenization_utils_base.py:2496] 2023-11-12 20:15:45,211 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2600/added_tokens.json + + 58%|█████▊ | 2601/4476 [16:52:21<12:14:11, 23.49s/it] + 58%|█████▊ | 2602/4476 [16:52:44<12:09:45, 23.36s/it] + 58%|█████▊ | 2603/4476 [16:53:07<12:08:39, 23.34s/it] + 58%|█████▊ | 2604/4476 [16:53:31<12:09:10, 23.37s/it] + 58%|█████▊ | 2605/4476 [16:53:54<12:08:12, 23.35s/it] + 58%|█████▊ | 2606/4476 [16:54:17<12:04:58, 23.26s/it] + 58%|█████▊ | 2607/4476 [16:54:40<12:05:39, 23.30s/it] + 58%|█████▊ | 2608/4476 [16:55:04<12:07:24, 23.36s/it] + 58%|█████▊ | 2609/4476 [16:55:27<12:08:20, 23.41s/it] + 58%|█████▊ | 2610/4476 [16:55:51<12:09:13, 23.45s/it] + +{'loss': 0.385, 'learning_rate': 1.8563428197509502e-05, 'epoch': 1.75} + + 58%|█████▊ | 2610/4476 [16:55:51<12:09:13, 23.45s/it] + 58%|█████▊ | 2611/4476 [16:56:14<12:07:40, 23.41s/it] + 58%|█████▊ | 2612/4476 [16:56:37<12:05:56, 23.37s/it] + 58%|█████▊ | 2613/4476 [16:57:01<12:06:02, 23.38s/it] + 58%|█████▊ | 2614/4476 [16:57:24<12:03:48, 23.32s/it] + 58%|█████▊ | 2615/4476 [16:57:47<12:03:04, 23.31s/it] + 58%|█████▊ | 2616/4476 [16:58:11<12:01:35, 23.28s/it] + 58%|█████▊ | 2617/4476 [16:58:34<12:00:34, 23.26s/it] + 58%|█████▊ | 2618/4476 [16:58:57<12:02:52, 23.34s/it] + 59%|█████▊ | 2619/4476 [16:59:20<12:00:11, 23.27s/it] + 59%|█████▊ | 2620/4476 [16:59:44<12:02:01, 23.34s/it] + +{'loss': 0.3846, 'learning_rate': 1.839403474149225e-05, 'epoch': 1.76} + + 59%|█████▊ | 2620/4476 [16:59:44<12:02:01, 23.34s/it] + 59%|█████▊ | 2621/4476 [17:00:07<12:01:01, 23.32s/it] + 59%|█████▊ | 2622/4476 [17:00:31<12:01:31, 23.35s/it] + 59%|█████▊ | 2623/4476 [17:00:54<11:59:02, 23.28s/it] + 59%|█████▊ | 2624/4476 [17:01:17<12:00:20, 23.34s/it] + 59%|█████▊ | 2625/4476 [17:01:40<11:56:51, 23.24s/it] + 59%|█████▊ | 2626/4476 [17:02:03<11:56:40, 23.24s/it] + 59%|█████▊ | 2627/4476 [17:02:27<11:56:48, 23.26s/it] + 59%|█████▊ | 2628/4476 [17:02:50<11:58:10, 23.32s/it] + 59%|█████▊ | 2629/4476 [17:03:14<11:59:59, 23.39s/it] + 59%|█████▉ | 2630/4476 [17:03:37<11:59:52, 23.40s/it] + +{'loss': 0.3828, 'learning_rate': 1.8224966712856806e-05, 'epoch': 1.76} + + 59%|█████▉ | 2630/4476 [17:03:37<11:59:52, 23.40s/it] + 59%|█████▉ | 2631/4476 [17:04:00<11:56:32, 23.30s/it] + 59%|█████▉ | 2632/4476 [17:04:24<11:58:09, 23.37s/it] + 59%|█████▉ | 2633/4476 [17:04:47<11:56:35, 23.33s/it] + 59%|█████▉ | 2634/4476 [17:05:10<11:53:12, 23.23s/it] + 59%|█████▉ | 2635/4476 [17:05:33<11:51:25, 23.19s/it] + 59%|█████▉ | 2636/4476 [17:05:56<11:51:02, 23.19s/it] + 59%|█████▉ | 2637/4476 [17:06:19<11:50:17, 23.17s/it] + 59%|█████▉ | 2638/4476 [17:06:43<11:50:19, 23.19s/it] + 59%|█████▉ | 2639/4476 [17:07:06<11:51:13, 23.23s/it] + 59%|█████▉ | 2640/4476 [17:07:29<11:52:49, 23.30s/it] + +{'loss': 0.3792, 'learning_rate': 
1.8056232440343013e-05, 'epoch': 1.77} + + 59%|█████▉ | 2640/4476 [17:07:29<11:52:49, 23.30s/it] + 59%|█████▉ | 2641/4476 [17:07:53<11:51:19, 23.26s/it] + 59%|█████▉ | 2642/4476 [17:08:16<11:53:37, 23.35s/it] + 59%|█████▉ | 2643/4476 [17:08:39<11:51:05, 23.28s/it] + 59%|█████▉ | 2644/4476 [17:09:02<11:48:42, 23.21s/it] + 59%|█████▉ | 2645/4476 [17:09:26<11:48:18, 23.21s/it] + 59%|█████▉ | 2646/4476 [17:09:49<11:49:38, 23.27s/it] + 59%|█████▉ | 2647/4476 [17:10:13<11:52:58, 23.39s/it] + 59%|█████▉ | 2648/4476 [17:10:36<11:52:46, 23.40s/it] + 59%|█████▉ | 2649/4476 [17:10:59<11:49:03, 23.29s/it] + 59%|█████▉ | 2650/4476 [17:11:22<11:47:51, 23.26s/it] + +{'loss': 0.3814, 'learning_rate': 1.788784023624896e-05, 'epoch': 1.78} + + 59%|█████▉ | 2650/4476 [17:11:22<11:47:51, 23.26s/it] + 59%|█████▉ | 2651/4476 [17:11:45<11:46:36, 23.23s/it] + 59%|█████▉ | 2652/4476 [17:12:09<11:48:01, 23.29s/it] + 59%|█████▉ | 2653/4476 [17:12:32<11:48:29, 23.32s/it] + 59%|█████▉ | 2654/4476 [17:12:55<11:47:22, 23.29s/it] + 59%|█████▉ | 2655/4476 [17:13:19<11:48:55, 23.36s/it] + 59%|█████▉ | 2656/4476 [17:13:42<11:48:02, 23.34s/it] + 59%|█████▉ | 2657/4476 [17:14:06<11:46:34, 23.31s/it] + 59%|█████▉ | 2658/4476 [17:14:29<11:45:49, 23.29s/it] + 59%|█████▉ | 2659/4476 [17:14:52<11:43:06, 23.22s/it] + 59%|█████▉ | 2660/4476 [17:15:15<11:46:31, 23.34s/it] + +{'loss': 0.3819, 'learning_rate': 1.7719798396021558e-05, 'epoch': 1.78} + + 59%|█████▉ | 2660/4476 [17:15:15<11:46:31, 23.34s/it] + 59%|█████▉ | 2661/4476 [17:15:39<11:45:43, 23.33s/it] + 59%|█████▉ | 2662/4476 [17:16:02<11:45:35, 23.34s/it] + 59%|█████▉ | 2663/4476 [17:16:26<11:47:28, 23.41s/it] + 60%|█████▉ | 2664/4476 [17:16:49<11:47:11, 23.42s/it] + 60%|█████▉ | 2665/4476 [17:17:12<11:45:53, 23.39s/it] + 60%|█████▉ | 2666/4476 [17:17:36<11:44:06, 23.34s/it] + 60%|█████▉ | 2667/4476 [17:17:59<11:42:53, 23.31s/it] + 60%|█████▉ | 2668/4476 [17:18:23<11:45:55, 23.43s/it] + 60%|█████▉ | 2669/4476 [17:18:46<11:46:40, 23.46s/it] + 60%|█████▉ | 2670/4476 [17:19:10<11:45:41, 23.45s/it] + +{'loss': 0.3798, 'learning_rate': 1.7552115197847884e-05, 'epoch': 1.79} + + 60%|█████▉ | 2670/4476 [17:19:10<11:45:41, 23.45s/it] + 60%|█████▉ | 2671/4476 [17:19:33<11:45:25, 23.45s/it] + 60%|█████▉ | 2672/4476 [17:19:56<11:43:22, 23.39s/it] + 60%|█████▉ | 2673/4476 [17:20:20<11:43:01, 23.40s/it] + 60%|█████▉ | 2674/4476 [17:20:43<11:41:21, 23.35s/it] + 60%|█████▉ | 2675/4476 [17:21:06<11:41:51, 23.38s/it] + 60%|█████▉ | 2676/4476 [17:21:30<11:42:58, 23.43s/it] + 60%|█████▉ | 2677/4476 [17:21:53<11:40:23, 23.36s/it] + 60%|█████▉ | 2678/4476 [17:22:16<11:38:25, 23.31s/it] + 60%|█████▉ | 2679/4476 [17:22:40<11:38:28, 23.32s/it] + 60%|█████▉ | 2680/4476 [17:23:03<11:35:37, 23.24s/it] + +{'loss': 0.3772, 'learning_rate': 1.7384798902247316e-05, 'epoch': 1.8} + + 60%|█████▉ | 2680/4476 [17:23:03<11:35:37, 23.24s/it] + 60%|█████▉ | 2681/4476 [17:23:26<11:34:05, 23.20s/it] + 60%|█████▉ | 2682/4476 [17:23:49<11:37:00, 23.31s/it] + 60%|█████▉ | 2683/4476 [17:24:13<11:36:38, 23.31s/it] + 60%|█████▉ | 2684/4476 [17:24:36<11:34:42, 23.26s/it] + 60%|█████▉ | 2685/4476 [17:25:00<11:38:18, 23.39s/it] + 60%|██████ | 2686/4476 [17:25:23<11:36:55, 23.36s/it] + 60%|██████ | 2687/4476 [17:25:46<11:34:43, 23.30s/it] + 60%|██████ | 2688/4476 [17:26:10<11:36:31, 23.37s/it] + 60%|██████ | 2689/4476 [17:26:33<11:35:25, 23.35s/it] + 60%|██████ | 2690/4476 [17:26:56<11:30:16, 23.19s/it] + +{'loss': 0.3758, 'learning_rate': 1.7217857751664663e-05, 'epoch': 1.8} + + 60%|██████ | 2690/4476 [17:26:56<11:30:16, 
23.19s/it] + 60%|██████ | 2691/4476 [17:27:19<11:30:40, 23.22s/it] + 60%|██████ | 2692/4476 [17:27:42<11:32:31, 23.29s/it] + 60%|██████ | 2693/4476 [17:28:06<11:34:09, 23.36s/it] + 60%|██████ | 2694/4476 [17:28:29<11:34:28, 23.38s/it] + 60%|██████ | 2695/4476 [17:28:53<11:35:49, 23.44s/it] + 60%|██████ | 2696/4476 [17:29:17<11:36:39, 23.48s/it] + 60%|██████ | 2697/4476 [17:29:40<11:38:27, 23.56s/it] + 60%|██████ | 2698/4476 [17:30:04<11:36:56, 23.52s/it] + 60%|██████ | 2699/4476 [17:30:27<11:34:09, 23.44s/it] + 60%|██████ | 2700/4476 [17:30:51<11:35:49, 23.51s/it] + +{'loss': 0.3706, 'learning_rate': 1.7051299970064098e-05, 'epoch': 1.81} + + 60%|██████ | 2700/4476 [17:30:51<11:35:49, 23.51s/it] + 60%|██████ | 2701/4476 [17:31:14<11:37:22, 23.57s/it] + 60%|██████ | 2702/4476 [17:31:38<11:35:36, 23.53s/it] + 60%|██████ | 2703/4476 [17:32:01<11:35:34, 23.54s/it] + 60%|██████ | 2704/4476 [17:32:24<11:31:45, 23.42s/it] + 60%|██████ | 2705/4476 [17:32:48<11:32:31, 23.46s/it] + 60%|██████ | 2706/4476 [17:33:12<11:35:05, 23.56s/it] + 60%|██████ | 2707/4476 [17:33:35<11:32:22, 23.48s/it] + 61%|██████ | 2708/4476 [17:33:58<11:28:27, 23.36s/it] + 61%|██████ | 2709/4476 [17:34:22<11:29:44, 23.42s/it] + 61%|██████ | 2710/4476 [17:34:45<11:29:57, 23.44s/it] + +{'loss': 0.3805, 'learning_rate': 1.6885133762523985e-05, 'epoch': 1.82} + + 61%|██████ | 2710/4476 [17:34:45<11:29:57, 23.44s/it] + 61%|██████ | 2711/4476 [17:35:09<11:29:50, 23.45s/it] + 61%|██████ | 2712/4476 [17:35:32<11:29:58, 23.47s/it] + 61%|██████ | 2713/4476 [17:35:56<11:30:45, 23.51s/it] + 61%|██████ | 2714/4476 [17:36:19<11:28:27, 23.44s/it] + 61%|██████ | 2715/4476 [17:36:42<11:27:23, 23.42s/it] + 61%|██████ | 2716/4476 [17:37:06<11:29:57, 23.52s/it] + 61%|██████ | 2717/4476 [17:37:30<11:28:27, 23.48s/it] + 61%|██████ | 2718/4476 [17:37:53<11:26:34, 23.43s/it] + 61%|██████ | 2719/4476 [17:38:16<11:23:48, 23.35s/it] + 61%|██████ | 2720/4476 [17:38:39<11:22:14, 23.31s/it] + +{'loss': 0.3892, 'learning_rate': 1.6719367314832756e-05, 'epoch': 1.82} + + 61%|██████ | 2720/4476 [17:38:39<11:22:14, 23.31s/it] + 61%|██████ | 2721/4476 [17:39:03<11:22:18, 23.33s/it] + 61%|██████ | 2722/4476 [17:39:26<11:22:17, 23.34s/it] + 61%|██████ | 2723/4476 [17:39:49<11:21:49, 23.34s/it] + 61%|██████ | 2724/4476 [17:40:13<11:25:21, 23.47s/it] + 61%|██████ | 2725/4476 [17:40:37<11:25:24, 23.49s/it] + 61%|██████ | 2726/4476 [17:41:00<11:24:25, 23.47s/it] + 61%|██████ | 2727/4476 [17:41:24<11:24:20, 23.48s/it] + 61%|██████ | 2728/4476 [17:41:47<11:22:45, 23.44s/it] + 61%|██████ | 2729/4476 [17:42:10<11:18:45, 23.31s/it] + 61%|██████ | 2730/4476 [17:42:33<11:16:22, 23.24s/it] + +{'loss': 0.387, 'learning_rate': 1.65540087930856e-05, 'epoch': 1.83} + + 61%|██████ | 2730/4476 [17:42:33<11:16:22, 23.24s/it] + 61%|██████ | 2731/4476 [17:42:56<11:14:39, 23.20s/it] + 61%|██████ | 2732/4476 [17:43:19<11:15:02, 23.22s/it] + 61%|██████ | 2733/4476 [17:43:43<11:17:02, 23.31s/it] + 61%|██████ | 2734/4476 [17:44:06<11:18:31, 23.37s/it] + 61%|██████ | 2735/4476 [17:44:30<11:17:44, 23.36s/it] + 61%|██████ | 2736/4476 [17:44:53<11:18:18, 23.39s/it] + 61%|██████ | 2737/4476 [17:45:17<11:19:08, 23.43s/it] + 61%|██████ | 2738/4476 [17:45:40<11:15:51, 23.33s/it] + 61%|██████ | 2739/4476 [17:46:03<11:15:07, 23.32s/it] + 61%|██████ | 2740/4476 [17:46:26<11:13:02, 23.26s/it] + +{'loss': 0.3773, 'learning_rate': 1.6389066343282168e-05, 'epoch': 1.84} + + 61%|██████ | 2740/4476 [17:46:26<11:13:02, 23.26s/it] + 61%|██████ | 2741/4476 [17:46:50<11:14:42, 23.33s/it] + 61%|██████▏ | 
2742/4476 [17:47:13<11:13:50, 23.32s/it] + 61%|██████▏ | 2743/4476 [17:47:36<11:10:40, 23.22s/it] + 61%|██████▏ | 2744/4476 [17:48:00<11:12:55, 23.31s/it] + 61%|██████▏ | 2745/4476 [17:48:23<11:12:36, 23.31s/it] + 61%|██████▏ | 2746/4476 [17:48:46<11:13:46, 23.37s/it] + 61%|██████▏ | 2747/4476 [17:49:10<11:13:52, 23.38s/it] + 61%|██████▏ | 2748/4476 [17:49:33<11:12:32, 23.35s/it] + 61%|██████▏ | 2749/4476 [17:49:56<11:09:34, 23.26s/it] + 61%|██████▏ | 2750/4476 [17:50:20<11:10:03, 23.29s/it] + +{'loss': 0.3829, 'learning_rate': 1.6224548090925323e-05, 'epoch': 1.84} + + 61%|██████▏ | 2750/4476 [17:50:20<11:10:03, 23.29s/it] + 61%|██████▏ | 2751/4476 [17:50:43<11:07:22, 23.21s/it] + 61%|██████▏ | 2752/4476 [17:51:06<11:06:54, 23.21s/it] + 62%|██████▏ | 2753/4476 [17:51:29<11:09:33, 23.32s/it] + 62%|██████▏ | 2754/4476 [17:51:53<11:08:52, 23.31s/it] + 62%|██████▏ | 2755/4476 [17:52:16<11:06:31, 23.24s/it] + 62%|██████▏ | 2756/4476 [17:52:39<11:06:17, 23.24s/it] + 62%|██████▏ | 2757/4476 [17:53:02<11:05:01, 23.21s/it] + 62%|██████▏ | 2758/4476 [17:53:25<11:01:31, 23.10s/it] + 62%|██████▏ | 2759/4476 [17:53:48<10:58:51, 23.02s/it] + 62%|██████▏ | 2760/4476 [17:54:11<11:01:33, 23.13s/it] + +{'loss': 0.3697, 'learning_rate': 1.6060462140620835e-05, 'epoch': 1.85} + + 62%|██████▏ | 2760/4476 [17:54:11<11:01:33, 23.13s/it] + 62%|██████▏ | 2761/4476 [17:54:34<11:02:42, 23.19s/it] + 62%|██████▏ | 2762/4476 [17:54:57<11:00:58, 23.14s/it] + 62%|██████▏ | 2763/4476 [17:55:21<11:02:05, 23.19s/it] + 62%|██████▏ | 2764/4476 [17:55:44<11:04:38, 23.29s/it] + 62%|██████▏ | 2765/4476 [17:56:08<11:05:21, 23.33s/it] + 62%|██████▏ | 2766/4476 [17:56:31<11:04:31, 23.32s/it] + 62%|██████▏ | 2767/4476 [17:56:54<11:02:38, 23.26s/it] + 62%|██████▏ | 2768/4476 [17:57:18<11:04:04, 23.33s/it] + 62%|██████▏ | 2769/4476 [17:57:41<11:05:14, 23.38s/it] + 62%|██████▏ | 2770/4476 [17:58:04<11:03:42, 23.34s/it] + +{'loss': 0.3817, 'learning_rate': 1.589681657567811e-05, 'epoch': 1.86} + + 62%|██████▏ | 2770/4476 [17:58:04<11:03:42, 23.34s/it] + 62%|██████▏ | 2771/4476 [17:58:28<11:06:59, 23.47s/it] + 62%|██████▏ | 2772/4476 [17:58:51<11:03:35, 23.37s/it] + 62%|██████▏ | 2773/4476 [17:59:15<11:04:28, 23.41s/it] + 62%|██████▏ | 2774/4476 [17:59:38<11:02:38, 23.36s/it] + 62%|██████▏ | 2775/4476 [18:00:01<11:00:49, 23.31s/it] + 62%|██████▏ | 2776/4476 [18:00:25<11:01:14, 23.34s/it] + 62%|██████▏ | 2777/4476 [18:00:48<10:59:24, 23.29s/it] + 62%|██████▏ | 2778/4476 [18:01:11<10:59:16, 23.30s/it] + 62%|██████▏ | 2779/4476 [18:01:34<10:58:24, 23.28s/it] + 62%|██████▏ | 2780/4476 [18:01:58<10:59:39, 23.34s/it] + +{'loss': 0.3819, 'learning_rate': 1.5733619457712037e-05, 'epoch': 1.86} + + 62%|██████▏ | 2780/4476 [18:01:58<10:59:39, 23.34s/it] + 62%|██████▏ | 2781/4476 [18:02:21<10:56:33, 23.24s/it] + 62%|██████▏ | 2782/4476 [18:02:44<10:58:02, 23.31s/it] + 62%|██████▏ | 2783/4476 [18:03:08<11:01:14, 23.43s/it] + 62%|██████▏ | 2784/4476 [18:03:31<10:58:30, 23.35s/it] + 62%|██████▏ | 2785/4476 [18:03:55<10:58:16, 23.36s/it] + 62%|██████▏ | 2786/4476 [18:04:18<10:55:33, 23.27s/it] + 62%|██████▏ | 2787/4476 [18:04:41<10:59:25, 23.43s/it] + 62%|██████▏ | 2788/4476 [18:05:05<10:57:58, 23.39s/it] + 62%|██████▏ | 2789/4476 [18:05:28<10:57:57, 23.40s/it] + 62%|██████▏ | 2790/4476 [18:05:52<10:57:12, 23.39s/it] + +{'loss': 0.3756, 'learning_rate': 1.5570878826245773e-05, 'epoch': 1.87} + + 62%|██████▏ | 2790/4476 [18:05:52<10:57:12, 23.39s/it] + 62%|██████▏ | 2791/4476 [18:06:14<10:50:38, 23.17s/it] + 62%|██████▏ | 2792/4476 [18:06:37<10:50:58, 
23.19s/it] + 62%|██████▏ | 2793/4476 [18:07:01<10:50:14, 23.18s/it] + 62%|██████▏ | 2794/4476 [18:07:24<10:52:29, 23.28s/it] + 62%|██████▏ | 2795/4476 [18:07:47<10:51:31, 23.26s/it] + 62%|██████▏ | 2796/4476 [18:08:10<10:49:37, 23.20s/it] + 62%|██████▏ | 2797/4476 [18:08:34<10:50:51, 23.26s/it] + 63%|██████▎ | 2798/4476 [18:08:57<10:52:42, 23.34s/it] + 63%|██████▎ | 2799/4476 [18:09:21<10:51:10, 23.30s/it] + 63%|██████▎ | 2800/4476 [18:09:44<10:50:19, 23.28s/it] + +{'loss': 0.3791, 'learning_rate': 1.5408602698314777e-05, 'epoch': 1.88} + + 63%|██████▎ | 2800/4476 [18:09:44<10:50:19, 23.28s/it][INFO|trainer.py:2939] 2023-11-12 21:33:31,487 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2800 +[INFO|tokenization_utils_base.py:2437] 2023-11-12 21:33:31,519 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2800/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-11-12 21:33:31,519 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2800/special_tokens_map.json +[INFO|tokenization_utils_base.py:2496] 2023-11-12 21:33:31,519 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-2800/added_tokens.json + + 63%|██████▎ | 2801/4476 [18:10:07<10:48:08, 23.22s/it] + 63%|██████▎ | 2802/4476 [18:10:30<10:51:18, 23.34s/it] + 63%|██████▎ | 2803/4476 [18:10:54<10:50:13, 23.32s/it] + 63%|██████▎ | 2804/4476 [18:11:17<10:50:06, 23.33s/it] + 63%|██████▎ | 2805/4476 [18:11:40<10:48:07, 23.27s/it] + 63%|██████▎ | 2806/4476 [18:12:04<10:47:55, 23.28s/it] + 63%|██████▎ | 2807/4476 [18:12:27<10:48:10, 23.30s/it] + 63%|██████▎ | 2808/4476 [18:12:50<10:48:00, 23.31s/it] + 63%|██████▎ | 2809/4476 [18:13:13<10:45:05, 23.22s/it] + 63%|██████▎ | 2810/4476 [18:13:37<10:45:38, 23.25s/it] + +{'loss': 0.3765, 'learning_rate': 1.5246799068071818e-05, 'epoch': 1.88} + + 63%|██████▎ | 2810/4476 [18:13:37<10:45:38, 23.25s/it] + 63%|██████▎ | 2811/4476 [18:14:00<10:45:41, 23.27s/it] + 63%|██████▎ | 2812/4476 [18:14:23<10:46:59, 23.33s/it] + 63%|██████▎ | 2813/4476 [18:14:47<10:48:31, 23.40s/it] + 63%|██████▎ | 2814/4476 [18:15:10<10:48:41, 23.42s/it] + 63%|██████▎ | 2815/4476 [18:15:33<10:45:19, 23.31s/it] + 63%|██████▎ | 2816/4476 [18:15:57<10:44:44, 23.30s/it] + 63%|██████▎ | 2817/4476 [18:16:20<10:46:07, 23.37s/it] + 63%|██████▎ | 2818/4476 [18:16:44<10:46:07, 23.38s/it] + 63%|██████▎ | 2819/4476 [18:17:07<10:44:38, 23.34s/it] + 63%|██████▎ | 2820/4476 [18:17:31<10:47:17, 23.45s/it] + +{'loss': 0.3834, 'learning_rate': 1.5085475906393153e-05, 'epoch': 1.89} + + 63%|██████▎ | 2820/4476 [18:17:31<10:47:17, 23.45s/it] + 63%|██████▎ | 2821/4476 [18:17:54<10:47:48, 23.49s/it] + 63%|██████▎ | 2822/4476 [18:18:18<10:48:55, 23.54s/it] + 63%|██████▎ | 2823/4476 [18:18:41<10:45:16, 23.42s/it] + 63%|██████▎ | 2824/4476 [18:19:04<10:44:44, 23.42s/it] + 63%|██████▎ | 2825/4476 [18:19:28<10:45:19, 23.45s/it] + 63%|██████▎ | 2826/4476 [18:19:51<10:40:18, 23.28s/it] + 63%|██████▎ | 2827/4476 [18:20:14<10:40:16, 23.30s/it] + 63%|██████▎ | 2828/4476 [18:20:37<10:40:34, 23.32s/it] + 63%|██████▎ | 2829/4476 [18:21:01<10:39:24, 23.29s/it] + 63%|██████▎ | 2830/4476 [18:21:24<10:37:34, 23.24s/it] + +{'loss': 0.381, 'learning_rate': 1.4924641160485923e-05, 'epoch': 1.9} + + 63%|██████▎ | 2830/4476 [18:21:24<10:37:34, 23.24s/it] + 63%|██████▎ | 2831/4476 [18:21:47<10:38:21, 23.28s/it] + 63%|██████▎ | 2832/4476 [18:22:11<10:41:40, 
23.42s/it] + 63%|██████▎ | 2833/4476 [18:22:34<10:41:30, 23.43s/it] + 63%|██████▎ | 2834/4476 [18:22:58<10:42:41, 23.48s/it] + 63%|██████▎ | 2835/4476 [18:23:21<10:39:07, 23.37s/it] + 63%|██████▎ | 2836/4476 [18:23:44<10:38:35, 23.36s/it] + 63%|██████▎ | 2837/4476 [18:24:07<10:33:17, 23.18s/it] + 63%|██████▎ | 2838/4476 [18:24:31<10:33:58, 23.22s/it] + 63%|██████▎ | 2839/4476 [18:24:54<10:35:14, 23.28s/it] + 63%|██████▎ | 2840/4476 [18:25:17<10:35:59, 23.33s/it] + +{'loss': 0.3766, 'learning_rate': 1.4764302753496584e-05, 'epoch': 1.9} + + 63%|██████▎ | 2840/4476 [18:25:17<10:35:59, 23.33s/it] + 63%|██████▎ | 2841/4476 [18:25:40<10:32:07, 23.20s/it] + 63%|██████▎ | 2842/4476 [18:26:04<10:32:07, 23.21s/it] + 64%|██████▎ | 2843/4476 [18:26:27<10:33:38, 23.28s/it] + 64%|██████▎ | 2844/4476 [18:26:50<10:30:24, 23.18s/it] + 64%|██████▎ | 2845/4476 [18:27:13<10:30:30, 23.19s/it] + 64%|██████▎ | 2846/4476 [18:27:36<10:30:26, 23.21s/it] + 64%|██████▎ | 2847/4476 [18:28:00<10:29:47, 23.20s/it] + 64%|██████▎ | 2848/4476 [18:28:23<10:28:20, 23.16s/it] + 64%|██████▎ | 2849/4476 [18:28:46<10:27:23, 23.14s/it] + 64%|██████▎ | 2850/4476 [18:29:09<10:27:19, 23.15s/it] + +{'loss': 0.3815, 'learning_rate': 1.4604468584120607e-05, 'epoch': 1.91} + + 64%|██████▎ | 2850/4476 [18:29:09<10:27:19, 23.15s/it] + 64%|██████▎ | 2851/4476 [18:29:32<10:28:30, 23.21s/it] + 64%|██████▎ | 2852/4476 [18:29:55<10:27:24, 23.18s/it] + 64%|██████▎ | 2853/4476 [18:30:19<10:26:54, 23.18s/it] + 64%|██████▍ | 2854/4476 [18:30:42<10:27:51, 23.23s/it] + 64%|██████▍ | 2855/4476 [18:31:05<10:28:28, 23.26s/it] + 64%|██████▍ | 2856/4476 [18:31:28<10:26:30, 23.20s/it] + 64%|██████▍ | 2857/4476 [18:31:51<10:24:15, 23.13s/it] + 64%|██████▍ | 2858/4476 [18:32:15<10:26:01, 23.21s/it] + 64%|██████▍ | 2859/4476 [18:32:38<10:23:32, 23.14s/it] + 64%|██████▍ | 2860/4476 [18:33:01<10:25:30, 23.22s/it] + +{'loss': 0.3774, 'learning_rate': 1.4445146526213415e-05, 'epoch': 1.92} + + 64%|██████▍ | 2860/4476 [18:33:01<10:25:30, 23.22s/it] + 64%|██████▍ | 2861/4476 [18:33:24<10:27:00, 23.29s/it] + 64%|██████▍ | 2862/4476 [18:33:48<10:26:16, 23.28s/it] + 64%|██████▍ | 2863/4476 [18:34:11<10:24:10, 23.22s/it] + 64%|██████▍ | 2864/4476 [18:34:34<10:25:03, 23.27s/it] + 64%|██████▍ | 2865/4476 [18:34:57<10:25:02, 23.28s/it] + 64%|██████▍ | 2866/4476 [18:35:21<10:25:08, 23.30s/it] + 64%|██████▍ | 2867/4476 [18:35:44<10:24:22, 23.28s/it] + 64%|██████▍ | 2868/4476 [18:36:07<10:23:39, 23.27s/it] + 64%|██████▍ | 2869/4476 [18:36:30<10:21:20, 23.20s/it] + 64%|██████▍ | 2870/4476 [18:36:54<10:22:51, 23.27s/it] + +{'loss': 0.3879, 'learning_rate': 1.4286344428402454e-05, 'epoch': 1.92} + + 64%|██████▍ | 2870/4476 [18:36:54<10:22:51, 23.27s/it] + 64%|██████▍ | 2871/4476 [18:37:17<10:23:29, 23.31s/it] + 64%|██████▍ | 2872/4476 [18:37:41<10:23:11, 23.31s/it] + 64%|██████▍ | 2873/4476 [18:38:04<10:24:40, 23.38s/it] + 64%|██████▍ | 2874/4476 [18:38:27<10:23:48, 23.36s/it] + 64%|██████▍ | 2875/4476 [18:38:51<10:23:11, 23.36s/it] + 64%|██████▍ | 2876/4476 [18:39:14<10:23:59, 23.40s/it] + 64%|██████▍ | 2877/4476 [18:39:38<10:23:38, 23.40s/it] + 64%|██████▍ | 2878/4476 [18:40:01<10:23:52, 23.42s/it] + 64%|██████▍ | 2879/4476 [18:40:24<10:20:44, 23.32s/it] + 64%|██████▍ | 2880/4476 [18:40:47<10:19:18, 23.28s/it] + +{'loss': 0.3777, 'learning_rate': 1.412807011370052e-05, 'epoch': 1.93} + + 64%|██████▍ | 2880/4476 [18:40:47<10:19:18, 23.28s/it] + 64%|██████▍ | 2881/4476 [18:41:10<10:15:59, 23.17s/it] + 64%|██████▍ | 2882/4476 [18:41:34<10:16:21, 23.20s/it] + 64%|██████▍ | 
2883/4476 [18:41:57<10:16:26, 23.22s/it] + 64%|██████▍ | 2884/4476 [18:42:20<10:18:15, 23.30s/it] + 64%|██████▍ | 2885/4476 [18:42:44<10:18:28, 23.32s/it] + 64%|██████▍ | 2886/4476 [18:43:07<10:18:03, 23.32s/it] + 64%|██████▍ | 2887/4476 [18:43:30<10:18:59, 23.37s/it] + 65%|██████▍ | 2888/4476 [18:43:54<10:16:48, 23.31s/it] + 65%|██████▍ | 2889/4476 [18:44:17<10:17:15, 23.34s/it] + 65%|██████▍ | 2890/4476 [18:44:41<10:19:01, 23.42s/it] + +{'loss': 0.3806, 'learning_rate': 1.3970331379120455e-05, 'epoch': 1.94} + + 65%|██████▍ | 2890/4476 [18:44:41<10:19:01, 23.42s/it] + 65%|██████▍ | 2891/4476 [18:45:04<10:16:49, 23.35s/it] + 65%|██████▍ | 2892/4476 [18:45:27<10:17:54, 23.41s/it] + 65%|██████▍ | 2893/4476 [18:45:51<10:16:21, 23.36s/it] + 65%|██████▍ | 2894/4476 [18:46:14<10:17:02, 23.40s/it] + 65%|██████▍ | 2895/4476 [18:46:37<10:15:17, 23.35s/it] + 65%|██████▍ | 2896/4476 [18:47:01<10:14:51, 23.35s/it] + 65%|██████▍ | 2897/4476 [18:47:24<10:13:36, 23.32s/it] + 65%|██████▍ | 2898/4476 [18:47:47<10:13:10, 23.31s/it] + 65%|██████▍ | 2899/4476 [18:48:11<10:13:44, 23.35s/it] + 65%|██████▍ | 2900/4476 [18:48:34<10:15:49, 23.44s/it] + +{'loss': 0.3848, 'learning_rate': 1.3813135995290988e-05, 'epoch': 1.94} + + 65%|██████▍ | 2900/4476 [18:48:34<10:15:49, 23.44s/it] + 65%|██████▍ | 2901/4476 [18:48:58<10:15:15, 23.44s/it] + 65%|██████▍ | 2902/4476 [18:49:22<10:17:30, 23.54s/it] + 65%|██████▍ | 2903/4476 [18:49:45<10:14:04, 23.42s/it] + 65%|██████▍ | 2904/4476 [18:50:08<10:09:12, 23.25s/it] + 65%|██████▍ | 2905/4476 [18:50:31<10:08:42, 23.25s/it] + 65%|██████▍ | 2906/4476 [18:50:54<10:09:48, 23.30s/it] + 65%|██████▍ | 2907/4476 [18:51:17<10:05:43, 23.16s/it] + 65%|██████▍ | 2908/4476 [18:51:40<10:05:44, 23.18s/it] + 65%|██████▍ | 2909/4476 [18:52:04<10:08:32, 23.30s/it] + 65%|██████▌ | 2910/4476 [18:52:27<10:07:55, 23.29s/it] + +{'loss': 0.3745, 'learning_rate': 1.3656491706073935e-05, 'epoch': 1.95} + + 65%|██████▌ | 2910/4476 [18:52:27<10:07:55, 23.29s/it] + 65%|██████▌ | 2911/4476 [18:52:51<10:09:05, 23.35s/it] + 65%|██████▌ | 2912/4476 [18:53:14<10:07:33, 23.31s/it] + 65%|██████▌ | 2913/4476 [18:53:37<10:06:31, 23.28s/it] + 65%|██████▌ | 2914/4476 [18:54:01<10:09:07, 23.40s/it] + 65%|██████▌ | 2915/4476 [18:54:24<10:07:38, 23.36s/it] + 65%|██████▌ | 2916/4476 [18:54:47<10:05:21, 23.28s/it] + 65%|██████▌ | 2917/4476 [18:55:11<10:08:14, 23.41s/it] + 65%|██████▌ | 2918/4476 [18:55:34<10:08:21, 23.43s/it] + 65%|██████▌ | 2919/4476 [18:55:58<10:06:34, 23.37s/it] + 65%|██████▌ | 2920/4476 [18:56:21<10:04:59, 23.33s/it] + +{'loss': 0.377, 'learning_rate': 1.350040622818275e-05, 'epoch': 1.96} + + 65%|██████▌ | 2920/4476 [18:56:21<10:04:59, 23.33s/it] + 65%|██████▌ | 2921/4476 [18:56:44<10:02:20, 23.24s/it] + 65%|██████▌ | 2922/4476 [18:57:07<10:00:57, 23.20s/it] + 65%|██████▌ | 2923/4476 [18:57:30<10:02:01, 23.26s/it] + 65%|██████▌ | 2924/4476 [18:57:54<10:03:42, 23.34s/it] + 65%|██████▌ | 2925/4476 [18:58:17<10:03:24, 23.34s/it] + 65%|██████▌ | 2926/4476 [18:58:41<10:06:05, 23.46s/it] + 65%|██████▌ | 2927/4476 [18:59:04<10:02:19, 23.33s/it] + 65%|██████▌ | 2928/4476 [18:59:27<10:03:21, 23.39s/it] + 65%|██████▌ | 2929/4476 [18:59:51<10:01:01, 23.31s/it] + 65%|██████▌ | 2930/4476 [19:00:14<10:03:36, 23.43s/it] + +{'loss': 0.3783, 'learning_rate': 1.3344887250802345e-05, 'epoch': 1.96} + + 65%|██████▌ | 2930/4476 [19:00:14<10:03:36, 23.43s/it] + 65%|██████▌ | 2931/4476 [19:00:38<10:05:41, 23.52s/it] + 66%|██████▌ | 2932/4476 [19:01:01<10:02:14, 23.40s/it] + 66%|██████▌ | 2933/4476 [19:01:24<10:00:28, 
+ 66%|██████▌ | 2940/4476 [19:04:08<10:00:59, 23.48s/it]
+{'loss': 0.3768, 'learning_rate': 1.3189942435210301e-05, 'epoch': 1.97}
+ 66%|██████▌ | 2950/4476 [19:08:02<9:55:34, 23.42s/it]
+{'loss': 0.3744, 'learning_rate': 1.303557941439949e-05, 'epoch': 1.98}
+ 66%|██████▌ | 2960/4476 [19:11:56<9:52:44, 23.46s/it]
+{'loss': 0.3788, 'learning_rate': 1.2881805792702031e-05, 'epoch': 1.98}
+ 66%|██████▋ | 2970/4476 [19:15:50<9:49:52, 23.50s/it]
+{'loss': 0.3735, 'learning_rate': 1.2728629145414645e-05, 'epoch': 1.99}
+ 67%|██████▋ | 2980/4476 [19:19:43<9:37:14, 23.15s/it]
+{'loss': 0.3819, 'learning_rate': 1.257605701842554e-05, 'epoch': 2.0}
+ 67%|██████▋ | 2990/4476 [19:23:36<9:36:34, 23.28s/it]
+{'loss': 0.3812, 'learning_rate': 1.242409692784265e-05, 'epoch': 2.0}
+ 67%|██████▋ | 3000/4476 [19:27:29<9:35:34, 23.40s/it]
+{'loss': 0.3769, 'learning_rate': 1.2272756359623342e-05, 'epoch': 2.01}
+[INFO|trainer.py:2939] 2023-11-12 22:51:16,724 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3000
+[INFO|tokenization_utils_base.py:2437] 2023-11-12 22:51:16,763 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3000/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-12 22:51:16,763 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3000/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-12 22:51:16,763 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3000/added_tokens.json
+ 67%|██████▋ | 3010/4476 [19:31:22<9:25:59, 23.16s/it]
+{'loss': 0.3779, 'learning_rate': 1.2122042769205702e-05, 'epoch': 2.02}
+ 67%|██████▋ | 3020/4476 [19:35:16<9:27:26, 23.38s/it]
+{'loss': 0.3817, 'learning_rate': 1.1971963581141196e-05, 'epoch': 2.02}
+ 68%|██████▊ | 3030/4476 [19:39:09<9:18:36, 23.18s/it]
+{'loss': 0.3788, 'learning_rate': 1.1822526188728966e-05, 'epoch': 2.03}
+ 68%|██████▊ | 3040/4476 [19:43:03<9:19:27, 23.38s/it]
+{'loss': 0.3776, 'learning_rate': 1.1673737953651601e-05, 'epoch': 2.04}
+ 68%|██████▊ | 3050/4476 [19:46:58<9:19:47, 23.55s/it]
+{'loss': 0.3785, 'learning_rate': 1.1525606205612447e-05, 'epoch': 2.04}
+ 68%|██████▊ | 3060/4476 [19:50:52<9:11:52, 23.38s/it]
+{'loss': 0.3858, 'learning_rate': 1.1378138241974595e-05, 'epoch': 2.05}
+ 69%|██████▊ | 3070/4476 [19:54:46<9:06:04, 23.30s/it]
+{'loss': 0.3766, 'learning_rate': 1.1231341327401323e-05, 'epoch': 2.06}
+ 69%|██████▉ | 3080/4476 [19:58:39<9:05:06, 23.43s/it]
+{'loss': 0.3766, 'learning_rate': 1.1085222693498256e-05, 'epoch': 2.06}
+ 69%|██████▉ | 3090/4476 [20:02:32<8:58:09, 23.30s/it]
+{'loss': 0.3795, 'learning_rate': 1.093978953845713e-05, 'epoch': 2.07}
+ 69%|██████▉ | 3100/4476 [20:06:25<8:54:07, 23.29s/it]
+{'loss': 0.3837, 'learning_rate': 1.079504902670117e-05, 'epoch': 2.08}
+ 69%|██████▉ | 3110/4476 [20:10:18<8:51:30, 23.35s/it]
+{'loss': 0.377, 'learning_rate': 1.065100828853213e-05, 'epoch': 2.08}
+ 70%|██████▉ | 3120/4476 [20:14:12<8:46:07, 23.28s/it]
+{'loss': 0.3759, 'learning_rate': 1.0507674419779085e-05, 'epoch': 2.09}
+ 70%|██████▉ | 3130/4476 [20:18:05<8:42:44, 23.30s/it]
+{'loss': 0.3704, 'learning_rate': 1.0365054481448849e-05, 'epoch': 2.1}
+ 70%|███████ | 3140/4476 [20:21:59<8:41:02, 23.40s/it]
+{'loss': 0.3751, 'learning_rate': 1.02231554993781e-05, 'epoch': 2.1}
+ 70%|███████ | 3150/4476 [20:25:53<8:37:35, 23.42s/it]
+{'loss': 0.396, 'learning_rate': 1.0081984463887325e-05, 'epoch': 2.11}
+ 71%|███████ | 3160/4476 [20:29:46<8:29:29, 23.23s/it]
+{'loss': 0.3788, 'learning_rate': 9.941548329436425e-06, 'epoch': 2.12}
+ 71%|███████ | 3170/4476 [20:33:39<8:27:21, 23.31s/it]
+{'loss': 0.3767, 'learning_rate': 9.801854014282108e-06, 'epoch': 2.12}
+ 71%|███████ | 3180/4476 [20:37:32<8:21:39, 23.22s/it]
+{'loss': 0.3783, 'learning_rate': 9.662908400137125e-06, 'epoch': 2.13}
+ 71%|███████▏ | 3190/4476 [20:41:25<8:18:15, 23.25s/it]
+{'loss': 0.3775, 'learning_rate': 9.524718331831186e-06, 'epoch': 2.14}
+ 71%|███████▏ | 3200/4476 [20:45:19<8:17:10, 23.38s/it]
+{'loss': 0.3789, 'learning_rate': 9.387290616973859e-06, 'epoch': 2.14}
+[INFO|trainer.py:2939] 2023-11-13 00:09:06,861 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3200
+[INFO|tokenization_utils_base.py:2437] 2023-11-13 00:09:06,893 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3200/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-13 00:09:06,893 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3200/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-13 00:09:06,893 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3200/added_tokens.json
+ 72%|███████▏ | 3210/4476 [20:49:13<8:13:01, 23.37s/it]
+{'loss': 0.3776, 'learning_rate': 9.250632025619104e-06, 'epoch': 2.15}
+ 72%|███████▏ | 3220/4476 [20:53:06<8:08:30, 23.34s/it]
+{'loss': 0.368, 'learning_rate': 9.11474928993187e-06, 'epoch': 2.16}
+ 72%|███████▏ | 3230/4476 [20:56:59<8:04:35, 23.34s/it]
+{'loss': 0.378, 'learning_rate': 8.979649103856345e-06, 'epoch': 2.16}
+ 72%|███████▏ | 3240/4476 [21:00:53<8:02:16, 23.41s/it]
+{'loss': 0.3776, 'learning_rate': 8.84533812278629e-06, 'epoch': 2.17}
+ 73%|███████▎ | 3250/4476 [21:04:47<7:57:12, 23.35s/it]
+{'loss': 0.3731, 'learning_rate': 8.711822963237093e-06, 'epoch': 2.18}
+ 73%|███████▎ | 3260/4476 [21:08:40<7:53:27, 23.36s/it]
+{'loss': 0.3827, 'learning_rate': 8.579110202519894e-06, 'epoch': 2.18}
+ 73%|███████▎ | 3270/4476 [21:12:34<7:48:15, 23.30s/it]
+{'loss': 0.3725, 'learning_rate': 8.447206378417533e-06, 'epoch': 2.19}
+ 73%|███████▎ | 3280/4476 [21:16:27<7:46:24, 23.40s/it]
+{'loss': 0.372, 'learning_rate': 8.31611798886246e-06, 'epoch': 2.2}
+ 74%|███████▎ | 3290/4476 [21:20:21<7:42:24, 23.39s/it]
+{'loss': 0.3753, 'learning_rate': 8.185851491616677e-06, 'epoch': 2.2}
+ 74%|███████▎ | 3300/4476 [21:24:16<7:38:56, 23.42s/it]
+{'loss': 0.3732, 'learning_rate': 8.0564133039536e-06, 'epoch': 2.21}
+ 74%|███████▍ | 3310/4476 [21:28:09<7:34:29, 23.39s/it]
+{'loss': 0.37, 'learning_rate': 7.927809802341876e-06, 'epoch': 2.22}
+ 74%|███████▍ | 3320/4476 [21:32:03<7:30:23, 23.38s/it]
+{'loss': 0.372, 'learning_rate': 7.800047322131346e-06, 'epoch': 2.22}
+ 74%|███████▍ | 3330/4476 [21:35:56<7:24:40, 23.28s/it]
+{'loss': 0.3734, 'learning_rate': 7.673132157240877e-06, 'epoch': 2.23}
+ 75%|███████▍ | 3340/4476 [21:39:51<7:25:58, 23.56s/it]
+{'loss': 0.3734, 'learning_rate': 7.5470705598483405e-06, 'epoch': 2.24}
+ 75%|███████▍ | 3350/4476 [21:43:45<7:21:08, 23.51s/it]
+{'loss': 0.3784, 'learning_rate': 7.4218687400826075e-06, 'epoch': 2.24}
+ 75%|███████▌ | 3360/4476 [21:47:40<7:13:55, 23.33s/it]
+{'loss': 0.3715, 'learning_rate': 7.297532865717638e-06, 'epoch': 2.25}
+ 75%|███████▌ | 3370/4476 [21:51:33<7:09:56, 23.32s/it]
+{'loss': 0.3836, 'learning_rate': 7.174069061868591e-06, 'epoch': 2.26}
+ 76%|███████▌ | 3380/4476 [21:55:26<7:05:13, 23.28s/it]
+{'loss': 0.3784, 'learning_rate': 7.05148341069014e-06, 'epoch': 2.27}
+ 76%|███████▌ | 3390/4476 [21:59:19<7:01:14, 23.27s/it]
+{'loss': 0.3737, 'learning_rate': 6.929781951076836e-06, 'epoch': 2.27}
+ 76%|███████▌ | 3400/4476 [22:03:13<6:57:35, 23.29s/it]
+{'loss': 0.3827, 'learning_rate': 6.80897067836557e-06, 'epoch': 2.28}
+[INFO|trainer.py:2939] 2023-11-13 01:27:00,375 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3400
+[INFO|tokenization_utils_base.py:2437] 2023-11-13 01:27:00,406 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3400/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-13 01:27:00,406 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3400/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-13 01:27:00,406 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3400/added_tokens.json
+ 76%|███████▌ | 3410/4476 [22:07:08<6:57:37, 23.51s/it]
+{'loss': 0.3808, 'learning_rate': 6.6890555440403015e-06, 'epoch': 2.29}
+ 76%|███████▋ | 3420/4476 [22:11:02<6:51:58, 23.41s/it]
+{'loss': 0.3797, 'learning_rate': 6.570042455438822e-06, 'epoch': 2.29}
+ 77%|███████▋ | 3430/4476 [22:14:56<6:47:29, 23.37s/it]
+{'loss': 0.3739, 'learning_rate': 6.451937275461736e-06, 'epoch': 2.3}
+ 77%|███████▋ | 3440/4476 [22:18:50<6:43:37, 23.38s/it]
+{'loss': 0.3748, 'learning_rate': 6.334745822283699e-06, 'epoch': 2.31}
+ 77%|███████▋ | 3450/4476 [22:22:43<6:39:20, 23.35s/it]
+{'loss': 0.3769, 'learning_rate': 6.2184738690667214e-06, 'epoch': 2.31}
+ 77%|███████▋ | 3460/4476 [22:26:37<6:38:23, 23.53s/it]
+{'loss': 0.3756, 'learning_rate': 6.103127143675832e-06, 'epoch': 2.32}
+ 78%|███████▊ | 3470/4476 [22:30:30<6:30:15, 23.28s/it]
+{'loss': 0.3738, 'learning_rate': 5.988711328396859e-06, 'epoch': 2.33}
+ 78%|███████▊ | 3480/4476 [22:34:23<6:26:09, 23.26s/it]
+{'loss': 0.3676, 'learning_rate': 5.875232059656552e-06, 'epoch': 2.33}
+ 78%|███████▊ | 3490/4476 [22:38:16<6:22:07, 23.25s/it]
+{'loss': 0.3737, 'learning_rate': 5.762694927744866e-06, 'epoch': 2.34}
+ 78%|███████▊ | 3500/4476 [22:42:09<6:18:51, 23.29s/it]
+{'loss': 0.369, 'learning_rate': 5.651105476539623e-06, 'epoch': 2.35}
+ 78%|███████▊ | 3510/4476 [22:46:03<6:15:50, 23.34s/it]
+{'loss': 0.3723, 'learning_rate': 5.540469203233347e-06, 'epoch': 2.35}
+ 79%|███████▊ | 3520/4476 [22:49:57<6:13:21, 23.43s/it]
+{'loss': 0.3791, 'learning_rate': 5.430791558062518e-06, 'epoch': 2.36}
+ 79%|███████▉ | 3530/4476 [22:53:50<6:08:51, 23.39s/it]
+{'loss': 0.3753, 'learning_rate': 5.322077944039039e-06, 'epoch': 2.37}
+ 79%|███████▉ | 3540/4476 [22:57:43<6:04:03, 23.34s/it]
+{'loss': 0.3703, 'learning_rate': 5.21433371668407e-06, 'epoch': 2.37}
+ 79%|███████▉ | 3550/4476 [23:01:37<5:59:50, 23.32s/it]
+{'loss': 0.3781, 'learning_rate': 5.107564183764219e-06, 'epoch': 2.38}
+ 80%|███████▉ | 3560/4476 [23:05:30<5:54:00, 23.19s/it]
+{'loss': 0.3766, 'learning_rate': 5.001774605030074e-06, 'epoch': 2.39}
+ 80%|███████▉ | 3570/4476 [23:09:23<5:50:53, 23.24s/it]
+{'loss': 0.38, 'learning_rate': 4.8969701919570454e-06, 'epoch': 2.39}
+ 80%|███████▉ | 3580/4476 [23:13:16<5:47:45, 23.29s/it]
+{'loss': 0.3681, 'learning_rate': 4.7931561074887e-06, 'epoch': 2.4}
+ 80%|████████ | 3590/4476 [23:17:08<5:41:34, 23.13s/it]
+{'loss': 0.3752, 'learning_rate': 4.690337465782366e-06, 'epoch': 2.41}
+ 80%|████████ | 3600/4476 [23:21:02<5:42:07, 23.43s/it]
+{'loss': 0.3775, 'learning_rate': 4.588519331957241e-06, 'epoch': 2.41}
+[INFO|trainer.py:2939] 2023-11-13 02:44:49,960 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3600
+[INFO|tokenization_utils_base.py:2437] 2023-11-13 02:44:49,998 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3600/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-13 02:44:49,998 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3600/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-13 02:44:49,999 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3600/added_tokens.json
+ 81%|████████ | 3610/4476 [23:24:56<5:36:03, 23.28s/it]
+{'loss': 0.3677, 'learning_rate': 4.4877067218448285e-06, 'epoch': 2.42}
+ 81%|████████ | 3620/4476 [23:28:47<5:29:12, 23.08s/it]
+{'loss': 0.3718, 'learning_rate': 4.38790460174188e-06, 'epoch': 2.43}
+ 81%|████████ | 3630/4476 [23:32:41<5:29:30, 23.37s/it]
+{'loss': 0.3671, 'learning_rate': 4.289117888165708e-06, 'epoch': 2.43}
+ 81%|████████▏ | 3640/4476 [23:36:33<5:23:24, 23.21s/it]
+{'loss': 0.3728, 'learning_rate': 4.191351447612032e-06, 'epoch': 2.44}
+ 82%|████████▏ | 3650/4476 [23:40:27<5:19:43, 23.22s/it]
+{'loss': 0.3769, 'learning_rate': 4.094610096315199e-06, 'epoch': 2.45}
+ 82%|████████▏ | 3660/4476 [23:44:20<5:15:30, 23.20s/it]
+{'loss': 0.3777, 'learning_rate': 3.998898600010928e-06, 'epoch': 2.45}
+ 82%|████████▏ | 3670/4476 [23:48:14<5:15:50, 23.51s/it]
+{'loss': 0.3817, 'learning_rate': 3.904221673701566e-06, 'epoch': 2.46}
+ 82%|████████▏ | 3680/4476 [23:52:07<5:08:36, 23.26s/it]
+{'loss': 0.383, 'learning_rate': 3.810583981423796e-06, 'epoch': 2.47}
+ 82%|████████▏ | 3690/4476 [23:56:01<5:07:28, 23.47s/it]
+{'loss': 0.3719, 'learning_rate': 3.7179901360188533e-06, 'epoch': 2.47}
+ 83%|████████▎ | 3700/4476 [23:59:55<5:02:37, 23.40s/it]
+{'loss': 0.3716, 'learning_rate': 3.626444698905329e-06, 'epoch': 2.48}
+ 83%|████████▎ | 3710/4476 [24:03:48<4:56:38, 23.24s/it]
+{'loss': 0.3736, 'learning_rate': 3.5359521798544347e-06, 'epoch': 2.49}
+ 83%|████████▎ | 3720/4476 [24:07:41<4:53:01, 23.26s/it]
+{'loss': 0.3741, 'learning_rate': 3.4465170367678294e-06, 'epoch': 2.49}
+ 83%|████████▎ | 3730/4476 [24:11:35<4:50:30, 23.37s/it]
+{'loss': 0.3756, 'learning_rate': 3.3581436754580363e-06, 'epoch': 2.5}
+ 84%|████████▎ | 3740/4476 [24:15:28<4:46:38, 23.37s/it]
+{'loss': 0.3777, 'learning_rate': 3.270836449431397e-06, 'epoch': 2.51}
+ 84%|████████▍ | 3750/4476 [24:19:22<4:44:23, 23.50s/it]
+{'loss': 0.3774, 'learning_rate': 3.184599659673579e-06, 'epoch': 2.51}
+ 84%|████████▍ | 3760/4476 [24:23:17<4:39:22, 23.41s/it]
+{'loss': 0.3785, 'learning_rate': 3.0994375544377424e-06, 'epoch': 2.52}
+ 84%|████████▍ | 3770/4476 [24:27:11<4:34:57, 23.37s/it]
+{'loss': 0.3768, 'learning_rate': 3.0153543290352164e-06, 'epoch': 2.53}
+ 84%|████████▍ | 3780/4476 [24:31:05<4:30:34, 23.33s/it]
+{'loss': 0.377, 'learning_rate': 2.932354125628853e-06, 'epoch': 2.53}
+ 85%|████████▍ | 3790/4476 [24:34:58<4:27:07, 23.36s/it]
+{'loss': 0.3803, 'learning_rate': 2.8504410330289778e-06, 'epoch': 2.54}
+ 85%|████████▍ | 3800/4476 [24:38:51<4:22:21, 23.29s/it]
+{'loss': 0.3706, 'learning_rate': 2.769619086491923e-06, 'epoch': 2.55}
+[INFO|trainer.py:2939] 2023-11-13 04:02:38,481 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3800
+[INFO|tokenization_utils_base.py:2437] 2023-11-13 04:02:38,512 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3800/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-13 04:02:38,512 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3800/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-13 04:02:38,513 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-3800/added_tokens.json
+ 85%|████████▌ | 3810/4476 [24:42:45<4:20:21, 23.46s/it]
+{'loss': 0.3712, 'learning_rate': 2.6898922675213016e-06, 'epoch': 2.55}
+ 85%|████████▌ | 3820/4476 [24:46:38<4:16:01, 23.42s/it]
+{'loss': 0.369, 'learning_rate': 2.611264503671823e-06, 'epoch': 2.56}
+ 86%|████████▌ | 3830/4476 [24:50:31<4:10:57, 23.31s/it]
+{'loss': 0.3726, 'learning_rate': 2.533739668355814e-06, 'epoch': 2.57}
+ 86%|████████▌ | 3840/4476 [24:54:25<4:07:16, 23.33s/it]
+{'loss': 0.3746, 'learning_rate': 2.45732158065243e-06, 'epoch': 2.57}
+ 86%|████████▌ | 3850/4476 [24:58:18<4:03:17, 23.32s/it]
+{'loss': 0.3701, 'learning_rate': 2.382014005119501e-06, 'epoch': 2.58}
+ 86%|████████▌ | 3860/4476 [25:02:11<3:58:34, 23.24s/it]
+{'loss': 0.38, 'learning_rate': 2.3078206516080695e-06, 'epoch': 2.59}
+ 86%|████████▋ | 3870/4476 [25:06:05<3:57:10, 23.48s/it]
+{'loss': 0.3725, 'learning_rate': 2.2347451750796474e-06, 'epoch': 2.59}
+ 87%|████████▋ | 3880/4476 [25:09:58<3:51:04, 23.26s/it]
+{'loss': 0.3825, 'learning_rate': 2.1627911754261653e-06, 'epoch': 2.6}
+ 87%|████████▋ | 3890/4476 [25:13:51<3:47:43, 23.32s/it]
+{'loss': 0.3791, 'learning_rate': 
2.0919621972926156e-06, 'epoch': 2.61} + + 87%|████████▋ | 3890/4476 [25:13:51<3:47:43, 23.32s/it] + 87%|████████▋ | 3891/4476 [25:14:14<3:47:53, 23.37s/it] + 87%|████████▋ | 3892/4476 [25:14:38<3:47:09, 23.34s/it] + 87%|████████▋ | 3893/4476 [25:15:01<3:46:18, 23.29s/it] + 87%|████████▋ | 3894/4476 [25:15:24<3:46:11, 23.32s/it] + 87%|████████▋ | 3895/4476 [25:15:48<3:46:15, 23.37s/it] + 87%|████████▋ | 3896/4476 [25:16:11<3:44:55, 23.27s/it] + 87%|████████▋ | 3897/4476 [25:16:34<3:43:52, 23.20s/it] + 87%|████████▋ | 3898/4476 [25:16:57<3:43:58, 23.25s/it] + 87%|████████▋ | 3899/4476 [25:17:20<3:43:53, 23.28s/it] + 87%|████████▋ | 3900/4476 [25:17:44<3:43:53, 23.32s/it] + +{'loss': 0.3778, 'learning_rate': 2.022261729902458e-06, 'epoch': 2.61} + + 87%|████████▋ | 3900/4476 [25:17:44<3:43:53, 23.32s/it] + 87%|████████▋ | 3901/4476 [25:18:07<3:43:18, 23.30s/it] + 87%|████████▋ | 3902/4476 [25:18:31<3:43:38, 23.38s/it] + 87%|████████▋ | 3903/4476 [25:18:54<3:43:36, 23.41s/it] + 87%|████████▋ | 3904/4476 [25:19:18<3:43:24, 23.43s/it] + 87%|████████▋ | 3905/4476 [25:19:41<3:42:24, 23.37s/it] + 87%|████████▋ | 3906/4476 [25:20:04<3:42:53, 23.46s/it] + 87%|████████▋ | 3907/4476 [25:20:28<3:42:03, 23.42s/it] + 87%|████████▋ | 3908/4476 [25:20:51<3:40:36, 23.30s/it] + 87%|████████▋ | 3909/4476 [25:21:14<3:39:54, 23.27s/it] + 87%|████████▋ | 3910/4476 [25:21:38<3:40:46, 23.40s/it] + +{'loss': 0.3735, 'learning_rate': 1.953693206885715e-06, 'epoch': 2.62} + + 87%|████████▋ | 3910/4476 [25:21:38<3:40:46, 23.40s/it] + 87%|████████▋ | 3911/4476 [25:22:01<3:39:26, 23.30s/it] + 87%|████████▋ | 3912/4476 [25:22:24<3:38:47, 23.28s/it] + 87%|████████▋ | 3913/4476 [25:22:47<3:37:55, 23.22s/it] + 87%|████████▋ | 3914/4476 [25:23:11<3:38:10, 23.29s/it] + 87%|████████▋ | 3915/4476 [25:23:34<3:39:05, 23.43s/it] + 87%|████████▋ | 3916/4476 [25:23:58<3:38:45, 23.44s/it] + 88%|████████▊ | 3917/4476 [25:24:21<3:37:51, 23.38s/it] + 88%|████████▊ | 3918/4476 [25:24:44<3:36:17, 23.26s/it] + 88%|████████▊ | 3919/4476 [25:25:07<3:35:53, 23.26s/it] + 88%|████████▊ | 3920/4476 [25:25:31<3:35:35, 23.26s/it] + +{'loss': 0.3816, 'learning_rate': 1.8862600061098106e-06, 'epoch': 2.63} + + 88%|████████▊ | 3920/4476 [25:25:31<3:35:35, 23.26s/it] + 88%|████████▊ | 3921/4476 [25:25:54<3:36:15, 23.38s/it] + 88%|████████▊ | 3922/4476 [25:26:18<3:36:25, 23.44s/it] + 88%|████████▊ | 3923/4476 [25:26:41<3:34:42, 23.29s/it] + 88%|████████▊ | 3924/4476 [25:27:04<3:33:40, 23.23s/it] + 88%|████████▊ | 3925/4476 [25:27:27<3:33:22, 23.24s/it] + 88%|████████▊ | 3926/4476 [25:27:51<3:33:37, 23.30s/it] + 88%|████████▊ | 3927/4476 [25:28:14<3:34:32, 23.45s/it] + 88%|████████▊ | 3928/4476 [25:28:38<3:33:30, 23.38s/it] + 88%|████████▊ | 3929/4476 [25:29:00<3:31:48, 23.23s/it] + 88%|████████▊ | 3930/4476 [25:29:24<3:31:42, 23.26s/it] + +{'loss': 0.3752, 'learning_rate': 1.8199654495131974e-06, 'epoch': 2.63} + + 88%|████████▊ | 3930/4476 [25:29:24<3:31:42, 23.26s/it] + 88%|████████▊ | 3931/4476 [25:29:48<3:32:41, 23.42s/it] + 88%|████████▊ | 3932/4476 [25:30:11<3:32:12, 23.40s/it] + 88%|████████▊ | 3933/4476 [25:30:34<3:31:40, 23.39s/it] + 88%|████████▊ | 3934/4476 [25:30:57<3:30:49, 23.34s/it] + 88%|████████▊ | 3935/4476 [25:31:21<3:29:47, 23.27s/it] + 88%|████████▊ | 3936/4476 [25:31:44<3:30:55, 23.44s/it] + 88%|████████▊ | 3937/4476 [25:32:08<3:30:48, 23.47s/it] + 88%|████████▊ | 3938/4476 [25:32:31<3:30:00, 23.42s/it] + 88%|████████▊ | 3939/4476 [25:32:55<3:29:55, 23.46s/it] + 88%|████████▊ | 3940/4476 [25:33:18<3:28:24, 23.33s/it] + +{'loss': 
0.3739, 'learning_rate': 1.754812802941691e-06, 'epoch': 2.64} + + 88%|████████▊ | 3940/4476 [25:33:18<3:28:24, 23.33s/it] + 88%|████████▊ | 3941/4476 [25:33:41<3:28:41, 23.41s/it] + 88%|████████▊ | 3942/4476 [25:34:05<3:27:58, 23.37s/it] + 88%|████████▊ | 3943/4476 [25:34:28<3:27:49, 23.39s/it] + 88%|████████▊ | 3944/4476 [25:34:52<3:28:21, 23.50s/it] + 88%|████████▊ | 3945/4476 [25:35:15<3:27:30, 23.45s/it] + 88%|████████▊ | 3946/4476 [25:35:38<3:26:19, 23.36s/it] + 88%|████████▊ | 3947/4476 [25:36:02<3:26:42, 23.45s/it] + 88%|████████▊ | 3948/4476 [25:36:25<3:26:22, 23.45s/it] + 88%|████████▊ | 3949/4476 [25:36:49<3:25:25, 23.39s/it] + 88%|████████▊ | 3950/4476 [25:37:12<3:24:49, 23.36s/it] + +{'loss': 0.3745, 'learning_rate': 1.6908052759875836e-06, 'epoch': 2.65} + + 88%|████████▊ | 3950/4476 [25:37:12<3:24:49, 23.36s/it] + 88%|████████▊ | 3951/4476 [25:37:36<3:25:23, 23.47s/it] + 88%|████████▊ | 3952/4476 [25:37:59<3:24:30, 23.42s/it] + 88%|████████▊ | 3953/4476 [25:38:22<3:23:53, 23.39s/it] + 88%|████████▊ | 3954/4476 [25:38:46<3:23:18, 23.37s/it] + 88%|████████▊ | 3955/4476 [25:39:09<3:22:44, 23.35s/it] + 88%|████████▊ | 3956/4476 [25:39:32<3:22:26, 23.36s/it] + 88%|████████▊ | 3957/4476 [25:39:56<3:21:57, 23.35s/it] + 88%|████████▊ | 3958/4476 [25:40:19<3:21:55, 23.39s/it] + 88%|████████▊ | 3959/4476 [25:40:43<3:21:58, 23.44s/it] + 88%|████████▊ | 3960/4476 [25:41:06<3:21:54, 23.48s/it] + +{'loss': 0.3753, 'learning_rate': 1.6279460218315361e-06, 'epoch': 2.65} + + 88%|████████▊ | 3960/4476 [25:41:06<3:21:54, 23.48s/it] + 88%|████████▊ | 3961/4476 [25:41:30<3:21:44, 23.50s/it] + 89%|████████▊ | 3962/4476 [25:41:54<3:22:14, 23.61s/it] + 89%|████████▊ | 3963/4476 [25:42:17<3:21:17, 23.54s/it] + 89%|████████▊ | 3964/4476 [25:42:40<3:19:55, 23.43s/it] + 89%|████████▊ | 3965/4476 [25:43:04<3:19:33, 23.43s/it] + 89%|████████▊ | 3966/4476 [25:43:27<3:20:05, 23.54s/it] + 89%|████████▊ | 3967/4476 [25:43:51<3:19:29, 23.52s/it] + 89%|████████▊ | 3968/4476 [25:44:14<3:19:06, 23.52s/it] + 89%|████████▊ | 3969/4476 [25:44:38<3:17:50, 23.41s/it] + 89%|████████▊ | 3970/4476 [25:45:01<3:17:10, 23.38s/it] + +{'loss': 0.3736, 'learning_rate': 1.5662381370872532e-06, 'epoch': 2.66} + + 89%|████████▊ | 3970/4476 [25:45:01<3:17:10, 23.38s/it] + 89%|████████▊ | 3971/4476 [25:45:24<3:16:55, 23.40s/it] + 89%|████████▊ | 3972/4476 [25:45:48<3:16:47, 23.43s/it] + 89%|████████▉ | 3973/4476 [25:46:12<3:17:04, 23.51s/it] + 89%|████████▉ | 3974/4476 [25:46:35<3:16:37, 23.50s/it] + 89%|████████▉ | 3975/4476 [25:46:58<3:15:49, 23.45s/it] + 89%|████████▉ | 3976/4476 [25:47:22<3:16:15, 23.55s/it] + 89%|████████▉ | 3977/4476 [25:47:46<3:15:20, 23.49s/it] + 89%|████████▉ | 3978/4476 [25:48:09<3:14:59, 23.49s/it] + 89%|████████▉ | 3979/4476 [25:48:32<3:13:56, 23.41s/it] + 89%|████████▉ | 3980/4476 [25:48:56<3:13:09, 23.37s/it] + +{'loss': 0.3755, 'learning_rate': 1.5056846616489124e-06, 'epoch': 2.67} + + 89%|████████▉ | 3980/4476 [25:48:56<3:13:09, 23.37s/it] + 89%|████████▉ | 3981/4476 [25:49:19<3:13:43, 23.48s/it] + 89%|████████▉ | 3982/4476 [25:49:43<3:12:45, 23.41s/it] + 89%|████████▉ | 3983/4476 [25:50:06<3:13:13, 23.52s/it] + 89%|████████▉ | 3984/4476 [25:50:30<3:12:27, 23.47s/it] + 89%|████████▉ | 3985/4476 [25:50:53<3:10:46, 23.31s/it] + 89%|████████▉ | 3986/4476 [25:51:16<3:11:18, 23.42s/it] + 89%|████████▉ | 3987/4476 [25:51:39<3:09:55, 23.30s/it] + 89%|████████▉ | 3988/4476 [25:52:03<3:09:55, 23.35s/it] + 89%|████████▉ | 3989/4476 [25:52:26<3:09:39, 23.37s/it] + 89%|████████▉ | 3990/4476 
[25:52:50<3:09:52, 23.44s/it] + +{'loss': 0.3741, 'learning_rate': 1.4462885785414327e-06, 'epoch': 2.67} + + 89%|████████▉ | 3990/4476 [25:52:50<3:09:52, 23.44s/it] + 89%|████████▉ | 3991/4476 [25:53:13<3:08:37, 23.34s/it] + 89%|████████▉ | 3992/4476 [25:53:36<3:08:02, 23.31s/it] + 89%|████████▉ | 3993/4476 [25:53:59<3:07:28, 23.29s/it] + 89%|████████▉ | 3994/4476 [25:54:23<3:06:58, 23.27s/it] + 89%|████████▉ | 3995/4476 [25:54:46<3:06:35, 23.28s/it] + 89%|████████▉ | 3996/4476 [25:55:09<3:05:26, 23.18s/it] + 89%|████████▉ | 3997/4476 [25:55:32<3:05:27, 23.23s/it] + 89%|████████▉ | 3998/4476 [25:55:55<3:05:10, 23.24s/it] + 89%|████████▉ | 3999/4476 [25:56:19<3:05:25, 23.32s/it] + 89%|████████▉ | 4000/4476 [25:56:42<3:05:08, 23.34s/it] + +{'loss': 0.3708, 'learning_rate': 1.3880528137735132e-06, 'epoch': 2.68} + + 89%|████████▉ | 4000/4476 [25:56:42<3:05:08, 23.34s/it][INFO|trainer.py:2939] 2023-11-13 05:20:30,061 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4000 +[INFO|tokenization_utils_base.py:2437] 2023-11-13 05:20:30,092 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4000/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-11-13 05:20:30,092 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4000/special_tokens_map.json +[INFO|tokenization_utils_base.py:2496] 2023-11-13 05:20:30,092 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4000/added_tokens.json + + 89%|████████▉ | 4001/4476 [25:57:06<3:04:24, 23.29s/it] + 89%|████████▉ | 4002/4476 [25:57:29<3:05:00, 23.42s/it] + 89%|████████▉ | 4003/4476 [25:57:52<3:03:47, 23.31s/it] + 89%|████████▉ | 4004/4476 [25:58:16<3:03:22, 23.31s/it] + 89%|████████▉ | 4005/4476 [25:58:39<3:02:52, 23.30s/it] + 89%|████████▉ | 4006/4476 [25:59:02<3:02:04, 23.24s/it] + 90%|████████▉ | 4007/4476 [25:59:26<3:02:50, 23.39s/it] + 90%|████████▉ | 4008/4476 [25:59:49<3:01:59, 23.33s/it] + 90%|████████▉ | 4009/4476 [26:00:12<3:01:07, 23.27s/it] + 90%|████████▉ | 4010/4476 [26:00:35<3:00:39, 23.26s/it] + +{'loss': 0.3703, 'learning_rate': 1.3309802361934936e-06, 'epoch': 2.69} + + 90%|████████▉ | 4010/4476 [26:00:35<3:00:39, 23.26s/it] + 90%|████████▉ | 4011/4476 [26:00:59<3:01:28, 23.42s/it] + 90%|████████▉ | 4012/4476 [26:01:22<3:00:57, 23.40s/it] + 90%|████████▉ | 4013/4476 [26:01:46<3:00:46, 23.43s/it] + 90%|████████▉ | 4014/4476 [26:02:09<3:00:00, 23.38s/it] + 90%|████████▉ | 4015/4476 [26:02:32<2:59:19, 23.34s/it] + 90%|████████▉ | 4016/4476 [26:02:56<2:58:51, 23.33s/it] + 90%|████████▉ | 4017/4476 [26:03:19<2:58:19, 23.31s/it] + 90%|████████▉ | 4018/4476 [26:03:43<2:58:58, 23.45s/it] + 90%|████████▉ | 4019/4476 [26:04:06<2:58:14, 23.40s/it] + 90%|████████▉ | 4020/4476 [26:04:29<2:57:14, 23.32s/it] + +{'loss': 0.3784, 'learning_rate': 1.2750736573480248e-06, 'epoch': 2.69} + + 90%|████████▉ | 4020/4476 [26:04:29<2:57:14, 23.32s/it] + 90%|████████▉ | 4021/4476 [26:04:52<2:56:44, 23.31s/it] + 90%|████████▉ | 4022/4476 [26:05:16<2:55:51, 23.24s/it] + 90%|████████▉ | 4023/4476 [26:05:39<2:56:01, 23.31s/it] + 90%|████████▉ | 4024/4476 [26:06:02<2:55:27, 23.29s/it] + 90%|████████▉ | 4025/4476 [26:06:26<2:55:28, 23.34s/it] + 90%|████████▉ | 4026/4476 [26:06:49<2:55:13, 23.36s/it] + 90%|████████▉ | 4027/4476 [26:07:12<2:54:44, 23.35s/it] + 90%|████████▉ | 4028/4476 [26:07:36<2:54:44, 23.40s/it] + 90%|█████████ | 4029/4476 [26:07:59<2:54:32, 
23.43s/it] + 90%|█████████ | 4030/4476 [26:08:23<2:53:54, 23.40s/it] + +{'loss': 0.3785, 'learning_rate': 1.2203358313435609e-06, 'epoch': 2.7} + + 90%|█████████ | 4030/4476 [26:08:23<2:53:54, 23.40s/it] + 90%|█████████ | 4031/4476 [26:08:46<2:53:31, 23.40s/it] + 90%|█████████ | 4032/4476 [26:09:09<2:52:39, 23.33s/it] + 90%|█████████ | 4033/4476 [26:09:33<2:52:19, 23.34s/it] + 90%|█████████ | 4034/4476 [26:09:56<2:52:13, 23.38s/it] + 90%|█████████ | 4035/4476 [26:10:20<2:52:14, 23.43s/it] + 90%|█████████ | 4036/4476 [26:10:43<2:51:04, 23.33s/it] + 90%|█████████ | 4037/4476 [26:11:06<2:50:23, 23.29s/it] + 90%|█████████ | 4038/4476 [26:11:29<2:49:41, 23.25s/it] + 90%|█████████ | 4039/4476 [26:11:53<2:50:15, 23.38s/it] + 90%|█████████ | 4040/4476 [26:12:16<2:50:03, 23.40s/it] + +{'loss': 0.3832, 'learning_rate': 1.1667694547106978e-06, 'epoch': 2.71} + + 90%|█████████ | 4040/4476 [26:12:16<2:50:03, 23.40s/it] + 90%|█████████ | 4041/4476 [26:12:40<2:50:03, 23.46s/it] + 90%|█████████ | 4042/4476 [26:13:03<2:49:20, 23.41s/it] + 90%|█████████ | 4043/4476 [26:13:27<2:49:15, 23.45s/it] + 90%|█████████ | 4044/4476 [26:13:50<2:47:53, 23.32s/it] + 90%|█████████ | 4045/4476 [26:14:13<2:47:44, 23.35s/it] + 90%|█████████ | 4046/4476 [26:14:37<2:47:14, 23.34s/it] + 90%|█████████ | 4047/4476 [26:15:00<2:47:40, 23.45s/it] + 90%|█████████ | 4048/4476 [26:15:23<2:46:41, 23.37s/it] + 90%|█████████ | 4049/4476 [26:15:47<2:46:34, 23.41s/it] + 90%|█████████ | 4050/4476 [26:16:10<2:45:31, 23.31s/it] + +{'loss': 0.3708, 'learning_rate': 1.1143771662713214e-06, 'epoch': 2.71} + + 90%|█████████ | 4050/4476 [26:16:10<2:45:31, 23.31s/it] + 91%|█████████ | 4051/4476 [26:16:33<2:44:38, 23.24s/it] + 91%|█████████ | 4052/4476 [26:16:56<2:44:16, 23.25s/it] + 91%|█████████ | 4053/4476 [26:17:19<2:43:25, 23.18s/it] + 91%|█████████ | 4054/4476 [26:17:42<2:42:54, 23.16s/it] + 91%|█████████ | 4055/4476 [26:18:06<2:42:51, 23.21s/it] + 91%|█████████ | 4056/4476 [26:18:29<2:42:43, 23.25s/it] + 91%|█████████ | 4057/4476 [26:18:52<2:41:52, 23.18s/it] + 91%|█████████ | 4058/4476 [26:19:16<2:41:56, 23.24s/it] + 91%|█████████ | 4059/4476 [26:19:39<2:41:52, 23.29s/it] + 91%|█████████ | 4060/4476 [26:20:02<2:41:31, 23.30s/it] + +{'loss': 0.3777, 'learning_rate': 1.063161547008612e-06, 'epoch': 2.72} + + 91%|█████████ | 4060/4476 [26:20:02<2:41:31, 23.30s/it] + 91%|█████████ | 4061/4476 [26:20:25<2:40:47, 23.25s/it] + 91%|█████████ | 4062/4476 [26:20:49<2:40:57, 23.33s/it] + 91%|█████████ | 4063/4476 [26:21:12<2:40:11, 23.27s/it] + 91%|█████████ | 4064/4476 [26:21:35<2:39:57, 23.30s/it] + 91%|█████████ | 4065/4476 [26:21:59<2:39:33, 23.29s/it] + 91%|█████████ | 4066/4476 [26:22:22<2:39:37, 23.36s/it] + 91%|█████████ | 4067/4476 [26:22:46<2:39:35, 23.41s/it] + 91%|█████████ | 4068/4476 [26:23:09<2:39:03, 23.39s/it] + 91%|█████████ | 4069/4476 [26:23:32<2:38:30, 23.37s/it] + 91%|█████████ | 4070/4476 [26:23:56<2:38:46, 23.46s/it] + +{'loss': 0.375, 'learning_rate': 1.0131251199399089e-06, 'epoch': 2.73} + + 91%|█████████ | 4070/4476 [26:23:56<2:38:46, 23.46s/it] + 91%|█████████ | 4071/4476 [26:24:19<2:38:16, 23.45s/it] + 91%|█████████ | 4072/4476 [26:24:43<2:37:05, 23.33s/it] + 91%|█████████ | 4073/4476 [26:25:06<2:37:33, 23.46s/it] + 91%|█████████ | 4074/4476 [26:25:30<2:36:40, 23.38s/it] + 91%|█████████ | 4075/4476 [26:25:53<2:36:46, 23.46s/it] + 91%|█████████ | 4076/4476 [26:26:16<2:36:03, 23.41s/it] + 91%|█████████ | 4077/4476 [26:26:40<2:35:34, 23.39s/it] + 91%|█████████ | 4078/4476 [26:27:03<2:34:38, 23.31s/it] + 91%|█████████ | 
4079/4476 [26:27:26<2:34:22, 23.33s/it] + 91%|█████████ | 4080/4476 [26:27:50<2:33:59, 23.33s/it] + +{'loss': 0.3719, 'learning_rate': 9.642703499924216e-07, 'epoch': 2.73} + + 91%|█████████ | 4080/4476 [26:27:50<2:33:59, 23.33s/it] + 91%|█████████ | 4081/4476 [26:28:13<2:34:24, 23.45s/it] + 91%|█████████ | 4082/4476 [26:28:37<2:33:51, 23.43s/it] + 91%|█████████ | 4083/4476 [26:29:00<2:33:08, 23.38s/it] + 91%|█████████ | 4084/4476 [26:29:23<2:32:37, 23.36s/it] + 91%|█████████▏| 4085/4476 [26:29:47<2:31:57, 23.32s/it] + 91%|█████████▏| 4086/4476 [26:30:09<2:30:49, 23.20s/it] + 91%|█████████▏| 4087/4476 [26:30:33<2:31:00, 23.29s/it] + 91%|█████████▏| 4088/4476 [26:30:56<2:30:34, 23.28s/it] + 91%|█████████▏| 4089/4476 [26:31:20<2:30:43, 23.37s/it] + 91%|█████████▏| 4090/4476 [26:31:44<2:31:05, 23.49s/it] + +{'loss': 0.3776, 'learning_rate': 9.16599643881777e-07, 'epoch': 2.74} + + 91%|█████████▏| 4090/4476 [26:31:44<2:31:05, 23.49s/it] + 91%|█████████▏| 4091/4476 [26:32:07<2:31:08, 23.55s/it] + 91%|█████████▏| 4092/4476 [26:32:31<2:30:40, 23.54s/it] + 91%|█████████▏| 4093/4476 [26:32:54<2:29:04, 23.35s/it] + 91%|█████████▏| 4094/4476 [26:33:17<2:28:24, 23.31s/it] + 91%|█████████▏| 4095/4476 [26:33:40<2:27:59, 23.30s/it] + 92%|█████████▏| 4096/4476 [26:34:04<2:27:59, 23.37s/it] + 92%|█████████▏| 4097/4476 [26:34:27<2:27:30, 23.35s/it] + 92%|█████████▏| 4098/4476 [26:34:50<2:26:43, 23.29s/it] + 92%|█████████▏| 4099/4476 [26:35:13<2:26:07, 23.26s/it] + 92%|█████████▏| 4100/4476 [26:35:37<2:26:16, 23.34s/it] + +{'loss': 0.377, 'learning_rate': 8.701153499934833e-07, 'epoch': 2.75} + + 92%|█████████▏| 4100/4476 [26:35:37<2:26:16, 23.34s/it] + 92%|█████████▏| 4101/4476 [26:36:00<2:26:14, 23.40s/it] + 92%|█████████▏| 4102/4476 [26:36:24<2:25:39, 23.37s/it] + 92%|█████████▏| 4103/4476 [26:36:47<2:25:11, 23.35s/it] + 92%|█████████▏| 4104/4476 [26:37:10<2:24:42, 23.34s/it] + 92%|█████████▏| 4105/4476 [26:37:34<2:24:05, 23.30s/it] + 92%|█████████▏| 4106/4476 [26:37:57<2:23:36, 23.29s/it] + 92%|█████████▏| 4107/4476 [26:38:20<2:23:13, 23.29s/it] + 92%|█████████▏| 4108/4476 [26:38:44<2:23:06, 23.33s/it] + 92%|█████████▏| 4109/4476 [26:39:07<2:22:31, 23.30s/it] + 92%|█████████▏| 4110/4476 [26:39:30<2:22:40, 23.39s/it] + +{'loss': 0.3759, 'learning_rate': 8.248197582672395e-07, 'epoch': 2.75} + + 92%|█████████▏| 4110/4476 [26:39:30<2:22:40, 23.39s/it] + 92%|█████████▏| 4111/4476 [26:39:54<2:23:02, 23.51s/it] + 92%|█████████▏| 4112/4476 [26:40:17<2:22:00, 23.41s/it] + 92%|█████████▏| 4113/4476 [26:40:41<2:21:19, 23.36s/it] + 92%|█████████▏| 4114/4476 [26:41:04<2:20:49, 23.34s/it] + 92%|█████████▏| 4115/4476 [26:41:27<2:20:34, 23.36s/it] + 92%|█████████▏| 4116/4476 [26:41:50<2:19:45, 23.29s/it] + 92%|█████████▏| 4117/4476 [26:42:14<2:18:57, 23.22s/it] + 92%|█████████▏| 4118/4476 [26:42:37<2:18:38, 23.24s/it] + 92%|█████████▏| 4119/4476 [26:43:00<2:18:19, 23.25s/it] + 92%|█████████▏| 4120/4476 [26:43:24<2:18:21, 23.32s/it] + +{'loss': 0.3727, 'learning_rate': 7.807151000841118e-07, 'epoch': 2.76} + + 92%|█████████▏| 4120/4476 [26:43:24<2:18:21, 23.32s/it] + 92%|█████████▏| 4121/4476 [26:43:47<2:18:23, 23.39s/it] + 92%|█████████▏| 4122/4476 [26:44:11<2:18:08, 23.41s/it] + 92%|█████████▏| 4123/4476 [26:44:34<2:17:42, 23.41s/it] + 92%|█████████▏| 4124/4476 [26:44:57<2:17:23, 23.42s/it] + 92%|█████████▏| 4125/4476 [26:45:21<2:16:42, 23.37s/it] + 92%|█████████▏| 4126/4476 [26:45:44<2:16:15, 23.36s/it] + 92%|█████████▏| 4127/4476 [26:46:07<2:15:22, 23.27s/it] + 92%|█████████▏| 4128/4476 [26:46:31<2:15:18, 23.33s/it] 
+ 92%|█████████▏| 4129/4476 [26:46:54<2:14:46, 23.30s/it] + 92%|█████████▏| 4130/4476 [26:47:17<2:13:56, 23.23s/it] + +{'loss': 0.374, 'learning_rate': 7.378035481566181e-07, 'epoch': 2.77} + + 92%|█████████▏| 4130/4476 [26:47:17<2:13:56, 23.23s/it] + 92%|█████████▏| 4131/4476 [26:47:40<2:14:00, 23.31s/it] + 92%|█████████▏| 4132/4476 [26:48:04<2:14:22, 23.44s/it] + 92%|█████████▏| 4133/4476 [26:48:27<2:13:31, 23.36s/it] + 92%|█████████▏| 4134/4476 [26:48:51<2:13:41, 23.45s/it] + 92%|█████████▏| 4135/4476 [26:49:14<2:12:40, 23.34s/it] + 92%|█████████▏| 4136/4476 [26:49:37<2:11:59, 23.29s/it] + 92%|█████████▏| 4137/4476 [26:50:00<2:11:30, 23.28s/it] + 92%|█████████▏| 4138/4476 [26:50:23<2:10:41, 23.20s/it] + 92%|█████████▏| 4139/4476 [26:50:47<2:11:06, 23.34s/it] + 92%|█████████▏| 4140/4476 [26:51:11<2:11:20, 23.45s/it] + +{'loss': 0.3792, 'learning_rate': 6.960872164217064e-07, 'epoch': 2.77} + + 92%|█████████▏| 4140/4476 [26:51:11<2:11:20, 23.45s/it] + 93%|█████████▎| 4141/4476 [26:51:34<2:10:34, 23.39s/it] + 93%|█████████▎| 4142/4476 [26:51:57<2:09:48, 23.32s/it] + 93%|█████████▎| 4143/4476 [26:52:20<2:08:58, 23.24s/it] + 93%|█████████▎| 4144/4476 [26:52:44<2:08:55, 23.30s/it] + 93%|█████████▎| 4145/4476 [26:53:07<2:08:39, 23.32s/it] + 93%|█████████▎| 4146/4476 [26:53:30<2:08:14, 23.32s/it] + 93%|█████████▎| 4147/4476 [26:53:54<2:08:10, 23.37s/it] + 93%|█████████▎| 4148/4476 [26:54:17<2:07:41, 23.36s/it] + 93%|█████████▎| 4149/4476 [26:54:40<2:07:13, 23.34s/it] + 93%|█████████▎| 4150/4476 [26:55:04<2:06:35, 23.30s/it] + +{'loss': 0.3692, 'learning_rate': 6.555681599365926e-07, 'epoch': 2.78} + + 93%|█████████▎| 4150/4476 [26:55:04<2:06:35, 23.30s/it] + 93%|█████████▎| 4151/4476 [26:55:27<2:06:16, 23.31s/it] + 93%|█████████▎| 4152/4476 [26:55:50<2:05:57, 23.33s/it] + 93%|█████████▎| 4153/4476 [26:56:14<2:05:54, 23.39s/it] + 93%|█████████▎| 4154/4476 [26:56:37<2:05:27, 23.38s/it] + 93%|█████████▎| 4155/4476 [26:57:01<2:04:50, 23.34s/it] + 93%|█████████▎| 4156/4476 [26:57:24<2:04:26, 23.33s/it] + 93%|█████████▎| 4157/4476 [26:57:47<2:04:04, 23.34s/it] + 93%|█████████▎| 4158/4476 [26:58:10<2:03:24, 23.29s/it] + 93%|█████████▎| 4159/4476 [26:58:34<2:02:57, 23.27s/it] + 93%|█████████▎| 4160/4476 [26:58:57<2:03:23, 23.43s/it] + +{'loss': 0.3736, 'learning_rate': 6.16248374777545e-07, 'epoch': 2.79} + + 93%|█████████▎| 4160/4476 [26:58:57<2:03:23, 23.43s/it] + 93%|█████████▎| 4161/4476 [26:59:21<2:03:07, 23.45s/it] + 93%|█████████▎| 4162/4476 [26:59:44<2:02:45, 23.46s/it] + 93%|█████████▎| 4163/4476 [27:00:08<2:02:02, 23.40s/it] + 93%|█████████▎| 4164/4476 [27:00:31<2:01:31, 23.37s/it] + 93%|█████████▎| 4165/4476 [27:00:54<2:01:04, 23.36s/it] + 93%|█████████▎| 4166/4476 [27:01:18<2:00:44, 23.37s/it] + 93%|█████████▎| 4167/4476 [27:01:41<2:00:01, 23.31s/it] + 93%|█████████▎| 4168/4476 [27:02:04<2:00:05, 23.40s/it] + 93%|█████████▎| 4169/4476 [27:02:28<1:59:19, 23.32s/it] + 93%|█████████▎| 4170/4476 [27:02:51<1:58:40, 23.27s/it] + +{'loss': 0.3695, 'learning_rate': 5.781297979415456e-07, 'epoch': 2.79} + + 93%|█████████▎| 4170/4476 [27:02:51<1:58:40, 23.27s/it] + 93%|█████████▎| 4171/4476 [27:03:14<1:58:24, 23.29s/it] + 93%|█████████▎| 4172/4476 [27:03:37<1:57:23, 23.17s/it] + 93%|█████████▎| 4173/4476 [27:04:00<1:56:46, 23.13s/it] + 93%|█████████▎| 4174/4476 [27:04:23<1:56:55, 23.23s/it] + 93%|█████████▎| 4175/4476 [27:04:47<1:56:40, 23.26s/it] + 93%|█████████▎| 4176/4476 [27:05:10<1:56:04, 23.22s/it] + 93%|█████████▎| 4177/4476 [27:05:34<1:56:25, 23.36s/it] + 93%|█████████▎| 4178/4476 
[27:05:57<1:55:44, 23.31s/it] + 93%|█████████▎| 4179/4476 [27:06:20<1:55:25, 23.32s/it] + 93%|█████████▎| 4180/4476 [27:06:44<1:55:22, 23.39s/it] + +{'loss': 0.3716, 'learning_rate': 5.412143072508563e-07, 'epoch': 2.8} + + 93%|█████████▎| 4180/4476 [27:06:44<1:55:22, 23.39s/it] + 93%|█████████▎| 4181/4476 [27:07:07<1:55:36, 23.52s/it] + 93%|█████████▎| 4182/4476 [27:07:31<1:54:31, 23.37s/it] + 93%|█████████▎| 4183/4476 [27:07:54<1:53:54, 23.32s/it] + 93%|█████████▎| 4184/4476 [27:08:17<1:53:45, 23.38s/it] + 93%|█████████▎| 4185/4476 [27:08:41<1:53:29, 23.40s/it] + 94%|█████████▎| 4186/4476 [27:09:04<1:53:15, 23.43s/it] + 94%|█████████▎| 4187/4476 [27:09:27<1:52:42, 23.40s/it] + 94%|█████████▎| 4188/4476 [27:09:51<1:52:48, 23.50s/it] + 94%|█████████▎| 4189/4476 [27:10:15<1:52:26, 23.51s/it] + 94%|█████████▎| 4190/4476 [27:10:38<1:52:00, 23.50s/it] + +{'loss': 0.364, 'learning_rate': 5.055037212605279e-07, 'epoch': 2.81} + + 94%|█████████▎| 4190/4476 [27:10:38<1:52:00, 23.50s/it] + 94%|█████████▎| 4191/4476 [27:11:02<1:51:19, 23.44s/it] + 94%|█████████▎| 4192/4476 [27:11:25<1:51:17, 23.51s/it] + 94%|█████████▎| 4193/4476 [27:11:49<1:51:13, 23.58s/it] + 94%|█████████▎| 4194/4476 [27:12:13<1:51:03, 23.63s/it] + 94%|█████████▎| 4195/4476 [27:12:36<1:50:25, 23.58s/it] + 94%|█████████▎| 4196/4476 [27:13:00<1:49:46, 23.52s/it] + 94%|█████████▍| 4197/4476 [27:13:23<1:48:52, 23.42s/it] + 94%|█████████▍| 4198/4476 [27:13:46<1:48:45, 23.47s/it] + 94%|█████████▍| 4199/4476 [27:14:10<1:48:36, 23.52s/it] + 94%|█████████▍| 4200/4476 [27:14:33<1:47:40, 23.41s/it] + +{'loss': 0.3707, 'learning_rate': 4.709997991688114e-07, 'epoch': 2.81} + + 94%|█████████▍| 4200/4476 [27:14:33<1:47:40, 23.41s/it][INFO|trainer.py:2939] 2023-11-13 06:38:20,840 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4200 +[INFO|tokenization_utils_base.py:2437] 2023-11-13 06:38:20,872 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4200/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-11-13 06:38:20,872 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4200/special_tokens_map.json +[INFO|tokenization_utils_base.py:2496] 2023-11-13 06:38:20,872 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4200/added_tokens.json + + 94%|█████████▍| 4201/4476 [27:14:57<1:47:18, 23.41s/it] + 94%|█████████▍| 4202/4476 [27:15:20<1:46:58, 23.42s/it] + 94%|█████████▍| 4203/4476 [27:15:43<1:46:40, 23.45s/it] + 94%|█████████▍| 4204/4476 [27:16:07<1:46:23, 23.47s/it] + 94%|█████████▍| 4205/4476 [27:16:31<1:46:10, 23.51s/it] + 94%|█████████▍| 4206/4476 [27:16:54<1:45:29, 23.44s/it] + 94%|█████████▍| 4207/4476 [27:17:17<1:44:55, 23.40s/it] + 94%|█████████▍| 4208/4476 [27:17:40<1:44:16, 23.35s/it] + 94%|█████████▍| 4209/4476 [27:18:04<1:43:58, 23.37s/it] + 94%|█████████▍| 4210/4476 [27:18:27<1:43:27, 23.34s/it] + +{'loss': 0.3833, 'learning_rate': 4.377042407304827e-07, 'epoch': 2.82} + + 94%|█████████▍| 4210/4476 [27:18:27<1:43:27, 23.34s/it] + 94%|█████████▍| 4211/4476 [27:18:50<1:43:03, 23.33s/it] + 94%|█████████▍| 4212/4476 [27:19:13<1:42:19, 23.26s/it] + 94%|█████████▍| 4213/4476 [27:19:37<1:42:22, 23.35s/it] + 94%|█████████▍| 4214/4476 [27:20:00<1:42:00, 23.36s/it] + 94%|█████████▍| 4215/4476 [27:20:24<1:41:36, 23.36s/it] + 94%|█████████▍| 4216/4476 [27:20:48<1:41:53, 23.51s/it] + 94%|█████████▍| 4217/4476 [27:21:11<1:41:35, 
23.53s/it] + 94%|█████████▍| 4218/4476 [27:21:35<1:40:59, 23.49s/it] + 94%|█████████▍| 4219/4476 [27:21:58<1:40:24, 23.44s/it] + 94%|█████████▍| 4220/4476 [27:22:21<1:39:44, 23.38s/it] + +{'loss': 0.3791, 'learning_rate': 4.0561868617312316e-07, 'epoch': 2.83} + + 94%|█████████▍| 4220/4476 [27:22:21<1:39:44, 23.38s/it] + 94%|█████████▍| 4221/4476 [27:22:44<1:39:13, 23.35s/it] + 94%|█████████▍| 4222/4476 [27:23:08<1:38:28, 23.26s/it] + 94%|█████████▍| 4223/4476 [27:23:31<1:38:01, 23.25s/it] + 94%|█████████▍| 4224/4476 [27:23:54<1:37:46, 23.28s/it] + 94%|█████████▍| 4225/4476 [27:24:18<1:37:35, 23.33s/it] + 94%|█████████▍| 4226/4476 [27:24:41<1:37:22, 23.37s/it] + 94%|█████████▍| 4227/4476 [27:25:04<1:37:03, 23.39s/it] + 94%|█████████▍| 4228/4476 [27:25:28<1:36:39, 23.39s/it] + 94%|█████████▍| 4229/4476 [27:25:51<1:36:05, 23.34s/it] + 95%|█████████▍| 4230/4476 [27:26:14<1:35:34, 23.31s/it] + +{'loss': 0.3743, 'learning_rate': 3.747447161163126e-07, 'epoch': 2.83} + + 95%|█████████▍| 4230/4476 [27:26:14<1:35:34, 23.31s/it] + 95%|█████████▍| 4231/4476 [27:26:38<1:35:28, 23.38s/it] + 95%|█████████▍| 4232/4476 [27:27:01<1:34:57, 23.35s/it] + 95%|█████████▍| 4233/4476 [27:27:25<1:34:40, 23.38s/it] + 95%|█████████▍| 4234/4476 [27:27:48<1:34:25, 23.41s/it] + 95%|█████████▍| 4235/4476 [27:28:11<1:33:27, 23.27s/it] + 95%|█████████▍| 4236/4476 [27:28:34<1:33:08, 23.28s/it] + 95%|█████████▍| 4237/4476 [27:28:58<1:32:46, 23.29s/it] + 95%|█████████▍| 4238/4476 [27:29:21<1:32:23, 23.29s/it] + 95%|█████████▍| 4239/4476 [27:29:44<1:32:02, 23.30s/it] + 95%|█████████▍| 4240/4476 [27:30:08<1:31:51, 23.36s/it] + +{'loss': 0.3759, 'learning_rate': 3.4508385149375764e-07, 'epoch': 2.84} + + 95%|█████████▍| 4240/4476 [27:30:08<1:31:51, 23.36s/it] + 95%|█████████▍| 4241/4476 [27:30:31<1:30:59, 23.23s/it] + 95%|█████████▍| 4242/4476 [27:30:54<1:31:13, 23.39s/it] + 95%|█████████▍| 4243/4476 [27:31:17<1:30:22, 23.27s/it] + 95%|█████████▍| 4244/4476 [27:31:41<1:30:00, 23.28s/it] + 95%|█████████▍| 4245/4476 [27:32:04<1:29:52, 23.34s/it] + 95%|█████████▍| 4246/4476 [27:32:27<1:29:06, 23.25s/it] + 95%|█████████▍| 4247/4476 [27:32:51<1:28:49, 23.27s/it] + 95%|█████████▍| 4248/4476 [27:33:14<1:28:12, 23.21s/it] + 95%|█████████▍| 4249/4476 [27:33:37<1:27:34, 23.15s/it] + 95%|█████████▍| 4250/4476 [27:34:00<1:27:18, 23.18s/it] + +{'loss': 0.3667, 'learning_rate': 3.166375534783717e-07, 'epoch': 2.85} + + 95%|█████████▍| 4250/4476 [27:34:00<1:27:18, 23.18s/it] + 95%|█████████▍| 4251/4476 [27:34:23<1:26:44, 23.13s/it] + 95%|█████████▍| 4252/4476 [27:34:46<1:26:16, 23.11s/it] + 95%|█████████▌| 4253/4476 [27:35:09<1:26:07, 23.17s/it] + 95%|█████████▌| 4254/4476 [27:35:33<1:25:55, 23.22s/it] + 95%|█████████▌| 4255/4476 [27:35:56<1:25:32, 23.22s/it] + 95%|█████████▌| 4256/4476 [27:36:19<1:25:18, 23.27s/it] + 95%|█████████▌| 4257/4476 [27:36:43<1:24:58, 23.28s/it] + 95%|█████████▌| 4258/4476 [27:37:06<1:25:09, 23.44s/it] + 95%|█████████▌| 4259/4476 [27:37:30<1:24:39, 23.41s/it] + 95%|█████████▌| 4260/4476 [27:37:53<1:23:55, 23.31s/it] + +{'loss': 0.3725, 'learning_rate': 2.8940722341030126e-07, 'epoch': 2.85} + + 95%|█████████▌| 4260/4476 [27:37:53<1:23:55, 23.31s/it] + 95%|█████████▌| 4261/4476 [27:38:16<1:23:46, 23.38s/it] + 95%|█████████▌| 4262/4476 [27:38:40<1:23:16, 23.35s/it] + 95%|█████████▌| 4263/4476 [27:39:03<1:23:19, 23.47s/it] + 95%|█████████▌| 4264/4476 [27:39:27<1:22:36, 23.38s/it] + 95%|█████████▌| 4265/4476 [27:39:50<1:22:22, 23.42s/it] + 95%|█████████▌| 4266/4476 [27:40:13<1:21:48, 23.38s/it] + 95%|█████████▌| 
4267/4476 [27:40:37<1:21:16, 23.33s/it] + 95%|█████████▌| 4268/4476 [27:41:00<1:21:13, 23.43s/it] + 95%|█████████▌| 4269/4476 [27:41:24<1:20:49, 23.43s/it] + 95%|█████████▌| 4270/4476 [27:41:47<1:20:19, 23.40s/it] + +{'loss': 0.3733, 'learning_rate': 2.6339420272787074e-07, 'epoch': 2.86} + + 95%|█████████▌| 4270/4476 [27:41:47<1:20:19, 23.40s/it] + 95%|█████████▌| 4271/4476 [27:42:10<1:19:43, 23.33s/it] + 95%|█████████▌| 4272/4476 [27:42:34<1:19:41, 23.44s/it] + 95%|█████████▌| 4273/4476 [27:42:57<1:19:23, 23.47s/it] + 95%|█████████▌| 4274/4476 [27:43:20<1:18:41, 23.38s/it] + 96%|█████████▌| 4275/4476 [27:43:44<1:18:40, 23.48s/it] + 96%|█████████▌| 4276/4476 [27:44:08<1:18:09, 23.45s/it] + 96%|█████████▌| 4277/4476 [27:44:31<1:17:37, 23.41s/it] + 96%|█████████▌| 4278/4476 [27:44:54<1:17:02, 23.34s/it] + 96%|█████████▌| 4279/4476 [27:45:18<1:16:56, 23.44s/it] + 96%|█████████▌| 4280/4476 [27:45:41<1:16:44, 23.49s/it] + +{'loss': 0.373, 'learning_rate': 2.3859977290152935e-07, 'epoch': 2.87} + + 96%|█████████▌| 4280/4476 [27:45:41<1:16:44, 23.49s/it] + 96%|█████████▌| 4281/4476 [27:46:05<1:16:03, 23.40s/it] + 96%|█████████▌| 4282/4476 [27:46:28<1:15:38, 23.39s/it] + 96%|█████████▌| 4283/4476 [27:46:51<1:14:53, 23.28s/it] + 96%|█████████▌| 4284/4476 [27:47:14<1:14:16, 23.21s/it] + 96%|█████████▌| 4285/4476 [27:47:37<1:13:52, 23.21s/it] + 96%|█████████▌| 4286/4476 [27:48:01<1:13:43, 23.28s/it] + 96%|█████████▌| 4287/4476 [27:48:24<1:13:28, 23.33s/it] + 96%|█████████▌| 4288/4476 [27:48:47<1:12:47, 23.23s/it] + 96%|█████████▌| 4289/4476 [27:49:10<1:12:25, 23.24s/it] + 96%|█████████▌| 4290/4476 [27:49:34<1:12:18, 23.32s/it] + +{'loss': 0.3725, 'learning_rate': 2.1502515537069334e-07, 'epoch': 2.87} + + 96%|█████████▌| 4290/4476 [27:49:34<1:12:18, 23.32s/it] + 96%|█████████▌| 4291/4476 [27:49:57<1:11:55, 23.33s/it] + 96%|█████████▌| 4292/4476 [27:50:20<1:11:03, 23.17s/it] + 96%|█████████▌| 4293/4476 [27:50:43<1:10:44, 23.19s/it] + 96%|█████████▌| 4294/4476 [27:51:07<1:10:52, 23.36s/it] + 96%|█████████▌| 4295/4476 [27:51:30<1:10:24, 23.34s/it] + 96%|█████████▌| 4296/4476 [27:51:54<1:09:58, 23.32s/it] + 96%|█████████▌| 4297/4476 [27:52:17<1:09:29, 23.29s/it] + 96%|█████████▌| 4298/4476 [27:52:41<1:09:28, 23.42s/it] + 96%|█████████▌| 4299/4476 [27:53:04<1:08:52, 23.35s/it] + 96%|█████████▌| 4300/4476 [27:53:27<1:08:29, 23.35s/it] + +{'loss': 0.3729, 'learning_rate': 1.926715114835914e-07, 'epoch': 2.88} + + 96%|█████████▌| 4300/4476 [27:53:27<1:08:29, 23.35s/it] + 96%|█████████▌| 4301/4476 [27:53:50<1:08:07, 23.36s/it] + 96%|█████████▌| 4302/4476 [27:54:14<1:07:42, 23.35s/it] + 96%|█████████▌| 4303/4476 [27:54:37<1:07:36, 23.45s/it] + 96%|█████████▌| 4304/4476 [27:55:01<1:07:10, 23.43s/it] + 96%|█████████▌| 4305/4476 [27:55:24<1:06:48, 23.44s/it] + 96%|█████████▌| 4306/4476 [27:55:48<1:06:40, 23.53s/it] + 96%|█████████▌| 4307/4476 [27:56:11<1:06:05, 23.46s/it] + 96%|█████████▌| 4308/4476 [27:56:35<1:05:31, 23.40s/it] + 96%|█████████▋| 4309/4476 [27:56:58<1:05:16, 23.45s/it] + 96%|█████████▋| 4310/4476 [27:57:22<1:04:51, 23.44s/it] + +{'loss': 0.3742, 'learning_rate': 1.7153994244005766e-07, 'epoch': 2.89} + + 96%|█████████▋| 4310/4476 [27:57:22<1:04:51, 23.44s/it] + 96%|█████████▋| 4311/4476 [27:57:45<1:04:20, 23.40s/it] + 96%|█████████▋| 4312/4476 [27:58:09<1:04:16, 23.52s/it] + 96%|█████████▋| 4313/4476 [27:58:32<1:03:49, 23.49s/it] + 96%|█████████▋| 4314/4476 [27:58:56<1:03:39, 23.58s/it] + 96%|█████████▋| 4315/4476 [27:59:19<1:02:55, 23.45s/it] + 96%|█████████▋| 4316/4476 [27:59:42<1:02:18, 
23.37s/it] + 96%|█████████▋| 4317/4476 [28:00:06<1:02:02, 23.41s/it] + 96%|█████████▋| 4318/4476 [28:00:29<1:01:25, 23.33s/it] + 96%|█████████▋| 4319/4476 [28:00:52<1:00:57, 23.30s/it] + 97%|█████████▋| 4320/4476 [28:01:15<1:00:31, 23.28s/it] + +{'loss': 0.3739, 'learning_rate': 1.516314892372639e-07, 'epoch': 2.89} + + 97%|█████████▋| 4320/4476 [28:01:15<1:00:31, 23.28s/it] + 97%|█████████▋| 4321/4476 [28:01:39<1:00:08, 23.28s/it] + 97%|█████████▋| 4322/4476 [28:02:02<59:45, 23.28s/it] + 97%|█████████▋| 4323/4476 [28:02:25<59:13, 23.22s/it] + 97%|█████████▋| 4324/4476 [28:02:49<59:05, 23.33s/it] + 97%|█████████▋| 4325/4476 [28:03:12<58:48, 23.37s/it] + 97%|█████████▋| 4326/4476 [28:03:36<58:37, 23.45s/it] + 97%|█████████▋| 4327/4476 [28:03:59<58:06, 23.40s/it] + 97%|█████████▋| 4328/4476 [28:04:22<57:37, 23.36s/it] + 97%|█████████▋| 4329/4476 [28:04:46<57:13, 23.36s/it] + 97%|█████████▋| 4330/4476 [28:05:09<56:37, 23.27s/it] + +{'loss': 0.3755, 'learning_rate': 1.3294713261845503e-07, 'epoch': 2.9} + + 97%|█████████▋| 4330/4476 [28:05:09<56:37, 23.27s/it] + 97%|█████████▋| 4331/4476 [28:05:32<56:18, 23.30s/it] + 97%|█████████▋| 4332/4476 [28:05:55<56:02, 23.35s/it] + 97%|█████████▋| 4333/4476 [28:06:19<55:33, 23.31s/it] + 97%|█████████▋| 4334/4476 [28:06:42<55:11, 23.32s/it] + 97%|█████████▋| 4335/4476 [28:07:05<54:47, 23.32s/it] + 97%|█████████▋| 4336/4476 [28:07:29<54:23, 23.31s/it] + 97%|█████████▋| 4337/4476 [28:07:52<53:57, 23.29s/it] + 97%|█████████▋| 4338/4476 [28:08:15<53:29, 23.26s/it] + 97%|█████████▋| 4339/4476 [28:08:39<53:17, 23.34s/it] + 97%|█████████▋| 4340/4476 [28:09:01<52:35, 23.20s/it] + +{'loss': 0.3702, 'learning_rate': 1.1548779302463231e-07, 'epoch': 2.91} + + 97%|█████████▋| 4340/4476 [28:09:01<52:35, 23.20s/it] + 97%|█████████▋| 4341/4476 [28:09:24<52:03, 23.14s/it] + 97%|█████████▋| 4342/4476 [28:09:48<51:39, 23.13s/it] + 97%|█████████▋| 4343/4476 [28:10:11<51:17, 23.14s/it] + 97%|█████████▋| 4344/4476 [28:10:34<50:52, 23.12s/it] + 97%|█████████▋| 4345/4476 [28:10:57<50:34, 23.16s/it] + 97%|█████████▋| 4346/4476 [28:11:20<50:16, 23.21s/it] + 97%|█████████▋| 4347/4476 [28:11:44<49:57, 23.23s/it] + 97%|█████████▋| 4348/4476 [28:12:07<49:33, 23.23s/it] + 97%|█████████▋| 4349/4476 [28:12:30<49:10, 23.23s/it] + 97%|█████████▋| 4350/4476 [28:12:53<48:45, 23.22s/it] + +{'loss': 0.3761, 'learning_rate': 9.92543305492033e-08, 'epoch': 2.92} + + 97%|█████████▋| 4350/4476 [28:12:53<48:45, 23.22s/it] + 97%|█████████▋| 4351/4476 [28:13:17<48:24, 23.24s/it] + 97%|█████████▋| 4352/4476 [28:13:40<48:10, 23.31s/it] + 97%|█████████▋| 4353/4476 [28:14:04<48:01, 23.43s/it] + 97%|█████████▋| 4354/4476 [28:14:27<47:33, 23.39s/it] + 97%|█████████▋| 4355/4476 [28:14:50<47:08, 23.38s/it] + 97%|█████████▋| 4356/4476 [28:15:14<46:35, 23.29s/it] + 97%|█████████▋| 4357/4476 [28:15:37<46:17, 23.34s/it] + 97%|█████████▋| 4358/4476 [28:16:01<46:05, 23.44s/it] + 97%|█████████▋| 4359/4476 [28:16:24<45:38, 23.41s/it] + 97%|█████████▋| 4360/4476 [28:16:48<45:24, 23.49s/it] + +{'loss': 0.38, 'learning_rate': 8.424754489561038e-08, 'epoch': 2.92} + + 97%|█████████▋| 4360/4476 [28:16:48<45:24, 23.49s/it] + 97%|█████████▋| 4361/4476 [28:17:11<44:53, 23.42s/it] + 97%|█████████▋| 4362/4476 [28:17:34<44:26, 23.39s/it] + 97%|█████████▋| 4363/4476 [28:17:58<44:15, 23.50s/it] + 97%|█████████▋| 4364/4476 [28:18:21<43:43, 23.43s/it] + 98%|█████████▊| 4365/4476 [28:18:45<43:24, 23.46s/it] + 98%|█████████▊| 4366/4476 [28:19:08<42:56, 23.43s/it] + 98%|█████████▊| 4367/4476 [28:19:32<42:31, 23.40s/it] + 
98%|█████████▊| 4368/4476 [28:19:55<42:19, 23.51s/it] + 98%|█████████▊| 4369/4476 [28:20:19<41:46, 23.43s/it] + 98%|█████████▊| 4370/4476 [28:20:42<41:25, 23.45s/it] + +{'loss': 0.3737, 'learning_rate': 7.046817533795102e-08, 'epoch': 2.93} + + 98%|█████████▊| 4370/4476 [28:20:42<41:25, 23.45s/it] + 98%|█████████▊| 4371/4476 [28:21:05<40:56, 23.40s/it] + 98%|█████████▊| 4372/4476 [28:21:29<40:30, 23.37s/it] + 98%|█████████▊| 4373/4476 [28:21:52<40:08, 23.39s/it] + 98%|█████████▊| 4374/4476 [28:22:15<39:35, 23.29s/it] + 98%|█████████▊| 4375/4476 [28:22:38<39:11, 23.28s/it] + 98%|█████████▊| 4376/4476 [28:23:02<38:47, 23.27s/it] + 98%|█████████▊| 4377/4476 [28:23:25<38:27, 23.31s/it] + 98%|█████████▊| 4378/4476 [28:23:48<38:00, 23.27s/it] + 98%|█████████▊| 4379/4476 [28:24:12<37:45, 23.35s/it] + 98%|█████████▊| 4380/4476 [28:24:35<37:12, 23.26s/it] + +{'loss': 0.3722, 'learning_rate': 5.7916900684540366e-08, 'epoch': 2.94} + + 98%|█████████▊| 4380/4476 [28:24:35<37:12, 23.26s/it] + 98%|█████████▊| 4381/4476 [28:24:58<36:44, 23.20s/it] + 98%|█████████▊| 4382/4476 [28:25:21<36:33, 23.33s/it] + 98%|█████████▊| 4383/4476 [28:25:45<36:08, 23.31s/it] + 98%|█████████▊| 4384/4476 [28:26:08<35:50, 23.37s/it] + 98%|█████████▊| 4385/4476 [28:26:32<35:34, 23.46s/it] + 98%|█████████▊| 4386/4476 [28:26:55<35:06, 23.41s/it] + 98%|█████████▊| 4387/4476 [28:27:18<34:39, 23.37s/it] + 98%|█████████▊| 4388/4476 [28:27:42<34:16, 23.37s/it] + 98%|█████████▊| 4389/4476 [28:28:05<33:53, 23.37s/it] + 98%|█████████▊| 4390/4476 [28:28:28<33:27, 23.35s/it] + +{'loss': 0.3805, 'learning_rate': 4.6594339244479536e-08, 'epoch': 2.94} + + 98%|█████████▊| 4390/4476 [28:28:29<33:27, 23.35s/it] + 98%|█████████▊| 4391/4476 [28:28:52<33:07, 23.38s/it] + 98%|█████████▊| 4392/4476 [28:29:16<32:50, 23.46s/it] + 98%|█████████▊| 4393/4476 [28:29:39<32:28, 23.48s/it] + 98%|█████████▊| 4394/4476 [28:30:03<32:10, 23.54s/it] + 98%|█████████▊| 4395/4476 [28:30:26<31:33, 23.38s/it] + 98%|█████████▊| 4396/4476 [28:30:49<31:03, 23.29s/it] + 98%|█████████▊| 4397/4476 [28:31:12<30:38, 23.27s/it] + 98%|█████████▊| 4398/4476 [28:31:36<30:20, 23.33s/it] + 98%|█████████▊| 4399/4476 [28:31:59<29:51, 23.27s/it] + 98%|█████████▊| 4400/4476 [28:32:22<29:28, 23.27s/it] + +{'loss': 0.3736, 'learning_rate': 3.650104879719951e-08, 'epoch': 2.95} + + 98%|█████████▊| 4400/4476 [28:32:22<29:28, 23.27s/it][INFO|trainer.py:2939] 2023-11-13 07:56:09,732 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4400 +[INFO|tokenization_utils_base.py:2437] 2023-11-13 07:56:09,764 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4400/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-11-13 07:56:09,764 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4400/special_tokens_map.json +[INFO|tokenization_utils_base.py:2496] 2023-11-13 07:56:09,764 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/checkpoint-4400/added_tokens.json + + 98%|█████████▊| 4401/4476 [28:32:46<29:16, 23.42s/it] + 98%|█████████▊| 4402/4476 [28:33:09<28:55, 23.45s/it] + 98%|█████████▊| 4403/4476 [28:33:33<28:27, 23.39s/it] + 98%|█████████▊| 4404/4476 [28:33:56<28:00, 23.33s/it] + 98%|█████████▊| 4405/4476 [28:34:19<27:35, 23.31s/it] + 98%|█████████▊| 4406/4476 [28:34:42<27:12, 23.32s/it] + 98%|█████████▊| 4407/4476 [28:35:06<26:47, 23.30s/it] + 98%|█████████▊| 4408/4476 [28:35:29<26:20, 
23.25s/it] + 99%|█████████▊| 4409/4476 [28:35:52<26:05, 23.37s/it] + 99%|█████████▊| 4410/4476 [28:36:16<25:42, 23.37s/it] + +{'loss': 0.37, 'learning_rate': 2.7637526564971982e-08, 'epoch': 2.96} + + 99%|█████████▊| 4410/4476 [28:36:16<25:42, 23.37s/it] + 99%|█████████▊| 4411/4476 [28:36:39<25:16, 23.33s/it] + 99%|█████████▊| 4412/4476 [28:37:02<24:51, 23.30s/it] + 99%|█████████▊| 4413/4476 [28:37:25<24:21, 23.21s/it] + 99%|█████████▊| 4414/4476 [28:37:48<23:57, 23.18s/it] + 99%|█████████▊| 4415/4476 [28:38:12<23:39, 23.26s/it] + 99%|█████████▊| 4416/4476 [28:38:35<23:20, 23.33s/it] + 99%|█████████▊| 4417/4476 [28:38:59<22:57, 23.35s/it] + 99%|█████████▊| 4418/4476 [28:39:22<22:41, 23.48s/it] + 99%|█████████▊| 4419/4476 [28:39:46<22:14, 23.41s/it] + 99%|█████████▊| 4420/4476 [28:40:09<21:49, 23.38s/it] + +{'loss': 0.374, 'learning_rate': 2.0004209188428937e-08, 'epoch': 2.96} + + 99%|█████████▊| 4420/4476 [28:40:09<21:49, 23.38s/it] + 99%|█████████▉| 4421/4476 [28:40:32<21:24, 23.36s/it] + 99%|█████████▉| 4422/4476 [28:40:56<21:07, 23.47s/it] + 99%|█████████▉| 4423/4476 [28:41:20<20:48, 23.56s/it] + 99%|█████████▉| 4424/4476 [28:41:43<20:22, 23.52s/it] + 99%|█████████▉| 4425/4476 [28:42:06<19:54, 23.43s/it] + 99%|█████████▉| 4426/4476 [28:42:30<19:30, 23.41s/it] + 99%|█████████▉| 4427/4476 [28:42:53<19:06, 23.40s/it] + 99%|█████████▉| 4428/4476 [28:43:16<18:40, 23.35s/it] + 99%|█████████▉| 4429/4476 [28:43:40<18:19, 23.40s/it] + 99%|█████████▉| 4430/4476 [28:44:03<17:58, 23.44s/it] + +{'loss': 0.3713, 'learning_rate': 1.3601472705046525e-08, 'epoch': 2.97} + + 99%|█████████▉| 4430/4476 [28:44:03<17:58, 23.44s/it] + 99%|█████████▉| 4431/4476 [28:44:27<17:32, 23.39s/it] + 99%|█████████▉| 4432/4476 [28:44:51<17:14, 23.50s/it] + 99%|█████████▉| 4433/4476 [28:45:14<16:47, 23.42s/it] + 99%|█████████▉| 4434/4476 [28:45:37<16:22, 23.39s/it] + 99%|█████████▉| 4435/4476 [28:46:00<15:54, 23.29s/it] + 99%|█████████▉| 4436/4476 [28:46:24<15:36, 23.41s/it] + 99%|█████████▉| 4437/4476 [28:46:47<15:14, 23.45s/it] + 99%|█████████▉| 4438/4476 [28:47:11<14:54, 23.55s/it] + 99%|█████████▉| 4439/4476 [28:47:34<14:27, 23.44s/it] + 99%|█████████▉| 4440/4476 [28:47:58<14:06, 23.51s/it] + +{'loss': 0.3686, 'learning_rate': 8.429632530618236e-09, 'epoch': 2.98} + + 99%|█████████▉| 4440/4476 [28:47:58<14:06, 23.51s/it] + 99%|█████████▉| 4441/4476 [28:48:22<13:44, 23.56s/it] + 99%|█████████▉| 4442/4476 [28:48:45<13:19, 23.51s/it] + 99%|█████████▉| 4443/4476 [28:49:08<12:53, 23.45s/it] + 99%|█████████▉| 4444/4476 [28:49:32<12:28, 23.38s/it] + 99%|█████████▉| 4445/4476 [28:49:55<12:01, 23.29s/it] + 99%|█████████▉| 4446/4476 [28:50:18<11:40, 23.34s/it] + 99%|█████████▉| 4447/4476 [28:50:42<11:18, 23.40s/it] + 99%|█████████▉| 4448/4476 [28:51:05<10:54, 23.37s/it] + 99%|█████████▉| 4449/4476 [28:51:29<10:32, 23.43s/it] + 99%|█████████▉| 4450/4476 [28:51:52<10:08, 23.40s/it] + +{'loss': 0.3721, 'learning_rate': 4.488943443711757e-09, 'epoch': 2.98} + + 99%|█████████▉| 4450/4476 [28:51:52<10:08, 23.40s/it] + 99%|█████████▉| 4451/4476 [28:52:15<09:43, 23.34s/it] + 99%|█████████▉| 4452/4476 [28:52:39<09:23, 23.48s/it] + 99%|█████████▉| 4453/4476 [28:53:02<08:59, 23.44s/it] +100%|█████████▉| 4454/4476 [28:53:26<08:36, 23.47s/it] +100%|█████████▉| 4455/4476 [28:53:49<08:11, 23.41s/it] +100%|█████████▉| 4456/4476 [28:54:12<07:46, 23.30s/it] +100%|█████████▉| 4457/4476 [28:54:35<07:23, 23.33s/it] +100%|█████████▉| 4458/4476 [28:54:59<07:02, 23.47s/it] +100%|█████████▉| 4459/4476 [28:55:23<06:37, 23.41s/it] +100%|█████████▉| 
+{'loss': 0.3766, 'learning_rate': 1.779599573137336e-09, 'epoch': 2.99}
+{'loss': 0.376, 'learning_rate': 3.017343883637835e-10, 'epoch': 3.0}
+100%|██████████| 4476/4476 [29:01:59<00:00, 23.29s/it][INFO|trainer.py:2017] 2023-11-13 08:25:46,723 >>
+
+Training completed. Do not forget to share your model on huggingface.co/models =)
+
+{'train_runtime': 104521.3326, 'train_samples_per_second': 10.964, 'train_steps_per_second': 0.043, 'train_loss': 0.40742741023141216, 'epoch': 3.0}
+[INFO|trainer.py:2939] 2023-11-13 08:25:46,728 >> Saving model checkpoint to /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned
+[INFO|tokenization_utils_base.py:2437] 2023-11-13 08:25:46,761 >> tokenizer config file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-11-13 08:25:46,761 >> Special tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2496] 2023-11-13 08:25:46,761 >> added tokens file saved in /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/added_tokens.json
+***** train metrics *****
+  epoch                    =        3.0
+  train_loss               =     0.4074
+  train_runtime            = 1 day, 5:02:01.33
+  train_samples_per_second =     10.964
+  train_steps_per_second   =      0.043
+Figure saved: /home/hz/projects/Project/chatglm3-6b-32k-wenshu-finetuned/training_loss.png
+11/13/2023 08:25:46 - WARNING - llmtuner.extras.ploting - No metric eval_loss to plot.
+[INFO|modelcard.py:452] 2023-11-13 08:25:46,887 >> Dropping the following result as it does not have all the necessary fields:
+{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
diff --git a/README copy.md b/README copy.md
new file mode 100644
index 0000000000000000000000000000000000000000..55a15e1092da699720a66a9caa3375a004a51c30
--- /dev/null
+++ b/README copy.md
@@ -0,0 +1,58 @@
+---
+base_model: /home/hz/projects/chatglm3-6b-32k
+tags:
+- llama-factory
+- lora
+- generated_from_trainer
+model-index:
+- name: chatglm3-6b-32k-wenshu-finetuned
+  results: []
+---
+
+
+# chatglm3-6b-32k-wenshu-finetuned
+
+This model is a fine-tuned version of [/home/hz/projects/chatglm3-6b-32k](https://huggingface.co//home/hz/projects/chatglm3-6b-32k) on the wenshu_dataset dataset.
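+
+A minimal inference sketch, not part of the auto-generated card: it assumes a local copy of the ChatGLM3-6B-32K base model at the path above, `peft` >= 0.6, and `transformers` with remote code enabled. The adapter path and the query are placeholders; the LoRA weights being attached (r=8, lora_alpha=32, target module `query_key_value`) correspond to the `adapter_config.json` shipped in this repo.
+
+```python
+from transformers import AutoModel, AutoTokenizer
+from peft import PeftModel
+
+base = "/home/hz/projects/chatglm3-6b-32k"      # base model used for fine-tuning
+adapter = "./chatglm3-6b-32k-wenshu-finetuned"  # placeholder path to this LoRA adapter
+
+# ChatGLM3 ships its modeling/tokenizer code with the checkpoint, so
+# trust_remote_code=True is required for both tokenizer and model.
+tokenizer = AutoTokenizer.from_pretrained(base, trust_remote_code=True)
+model = AutoModel.from_pretrained(base, trust_remote_code=True).half().cuda()
+
+# Attach the fine-tuned LoRA weights on top of the frozen base model.
+model = PeftModel.from_pretrained(model, adapter)
+model = model.eval()
+
+# ChatGLM3's remote code exposes a chat() helper; the query is a placeholder.
+response, history = model.chat(tokenizer, "Hello", history=[])
+print(response)
+```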
+ +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-05 +- train_batch_size: 4 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 8 +- gradient_accumulation_steps: 8 +- total_train_batch_size: 256 +- total_eval_batch_size: 64 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: cosine +- num_epochs: 3.0 + +### Training results + + + +### Framework versions + +- Transformers 4.34.0 +- Pytorch 2.0.1+cu117 +- Datasets 2.14.6 +- Tokenizers 0.14.1 diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapter_model.bin b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..471b3dff605032713448e839208aec03bdcb8abe --- /dev/null +++ b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3346e13155c0d39280e75d07fe63bd525777020def5c6512c3907aaea14da10 +size 7820185 diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..908713cc1dec5552cc9950bf65adcd068336abbb --- /dev/null +++ b/all_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 3.0, + "train_loss": 0.40742741023141216, + "train_runtime": 104521.3326, + "train_samples_per_second": 10.964, + "train_steps_per_second": 0.043 +} \ No newline at end of file diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-1000/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More 
Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-1000/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1000/adapter_model.bin b/checkpoint-1000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..13777b2cf850346075ff139c2001a37124643757 --- /dev/null +++ b/checkpoint-1000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3342fe0de43a1853b1fdaad882567a452ec747c7bb392a7e5e2a88c0a939cc11 +size 7820185 diff --git a/checkpoint-1000/added_tokens.json 
b/checkpoint-1000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-1000/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e008760b6770e1ea10d3cf9e049f3cde575ef70f --- /dev/null +++ b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3badd333c7c2ea3bb2494882b036831b3692efddb08fe380c12ae793f7d5d63 +size 15644485 diff --git a/checkpoint-1000/rng_state_0.pth b/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d887d20f8c4f57bfd7df045ba11c6e29c292858 --- /dev/null +++ b/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1060624c7954c3b286c1948a1dd5e1ce39c497aee826b7f77f55576e5309b4c3 +size 21687 diff --git a/checkpoint-1000/rng_state_1.pth b/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ff4492c23a17621af26f4eb4a6f060cd280adad7 --- /dev/null +++ b/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28602630f9bf652c63fca7559b6f34e3236d16cb19a88e14b1ae9abc3f89b7c6 +size 21687 diff --git a/checkpoint-1000/rng_state_2.pth b/checkpoint-1000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..22b3e3814e2c776a83c96a856732595c777c7e8c --- /dev/null +++ b/checkpoint-1000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:331d1c16c6e5f215989b0d4f6f031cadc6c60d030a502ce8c93d000b402b8ad4 +size 21687 diff --git a/checkpoint-1000/rng_state_3.pth b/checkpoint-1000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..5eba904b391e1086829d7c8d624be6c84c1dfabc --- /dev/null +++ b/checkpoint-1000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36b8b18dff0d9c7fa865aa16e2d89c59d88fad7d0bc2a1589c8a7cd422051ac8 +size 21687 diff --git a/checkpoint-1000/rng_state_4.pth b/checkpoint-1000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..330adf20b1be5c9095dfe4ae4f60a9a6617d587e --- /dev/null +++ b/checkpoint-1000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e607493ac32d6d104991335c8909b9165d6475b6001cfd544a15a014fb21aaef +size 21687 diff --git a/checkpoint-1000/rng_state_5.pth b/checkpoint-1000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..5ad5bbe1e860cc6ae94f12077bb00f91a1e6eecb --- /dev/null +++ b/checkpoint-1000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f91e8dbba9f0531a6db2b77df4458c926e4f57aa448d6fc5ef1918429d742736 +size 21687 diff --git a/checkpoint-1000/rng_state_6.pth b/checkpoint-1000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..5d054c60c7e373dc11c3e4561b2ad47bc4a5fb68 --- /dev/null +++ b/checkpoint-1000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34901406783e82c1f339065211640375722456aa206f399ee541b75f44a6a3a1 +size 21687 diff --git a/checkpoint-1000/rng_state_7.pth b/checkpoint-1000/rng_state_7.pth new file mode 100644 index 
0000000000000000000000000000000000000000..c39224f4a72a1d77bf1842ced9f44419cf82db75 --- /dev/null +++ b/checkpoint-1000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22decdcd459d4cbd6fb83a5afd8d9e6edec7ee066069d318782fde025ca4c4de +size 21687 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..191f758f9d5d35f9d3f0f631072e1fc3e847e670 --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:707b10eb98685e357773ca2125e0d6c1c1de2a1c4e7ededd34ea00989b0b159a +size 627 diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-1000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-1000/tokenization_chatglm.py b/checkpoint-1000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-1000/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-1000/tokenizer.model b/checkpoint-1000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-1000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a543c2ed813c44e20bdba263dea97debb9d74216 --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,619 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6701289998324678, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 9.097203781942116e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-1200/README.md b/checkpoint-1200/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-1200/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-1200/adapter_config.json b/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-1200/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1200/adapter_model.bin b/checkpoint-1200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..47b6c05708353a997ed9aaec67a7e1a23bdad2dc --- /dev/null +++ b/checkpoint-1200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f20c941e90bb5428a8c782b9fdd552a0a14752555acb90450f67d506fb61213e +size 7820185 diff --git a/checkpoint-1200/added_tokens.json b/checkpoint-1200/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-1200/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-1200/optimizer.pt b/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1047964632f6f901ecbba3c29c782848ef27e4ca --- /dev/null +++ b/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:026f20db4aa943cf7fa1f4842646d76165d84726ac3b103486469d146cba2af2 +size 15644485 diff --git a/checkpoint-1200/rng_state_0.pth b/checkpoint-1200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9a4e50eab1d0adf67594121133d103320ada132 --- /dev/null +++ b/checkpoint-1200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df9c86207da192eb45fafab4e339545a4212fcdb73911f03f77b2c74e2826efe +size 21687 diff --git a/checkpoint-1200/rng_state_1.pth b/checkpoint-1200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..54f8e948d746528928c0cb056c2c5ee4f8662ff8 --- /dev/null +++ b/checkpoint-1200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7f6ca0cecc54a94203b5893c79a4e9964e83f47d7fc3230251eb9aaaf8fdb015 +size 21687 diff --git a/checkpoint-1200/rng_state_2.pth b/checkpoint-1200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0937d41a72e2242ab60fcc4e79027edb71dc4a75 --- /dev/null +++ b/checkpoint-1200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb9205267352ed3614f0586889fed442a66c3c59f8e7824c1af3cadb71f1fa3a +size 21687 diff --git a/checkpoint-1200/rng_state_3.pth b/checkpoint-1200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..37db8cd3e69644615ca751332a514ce14515a16e --- /dev/null +++ b/checkpoint-1200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8666e3e6876bb78a90d36299e773a4dba514962e67e8bf9d2a2acbbe9c5373 +size 21687 diff --git a/checkpoint-1200/rng_state_4.pth b/checkpoint-1200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..33bccfa30f7877caf7596e6ccfacd5e7cae59a7f --- /dev/null +++ b/checkpoint-1200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d11de617b1d36bd8cf3a69ee99c9c9a4f30182d38f1f8a9b66e4482e42ec8e0a +size 21687 diff --git a/checkpoint-1200/rng_state_5.pth b/checkpoint-1200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a3f95af4cd32af1c948d4b16678a09d2638c2b2 --- /dev/null +++ b/checkpoint-1200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38a74118af5ea5a095571d00a792f8c938a69de0335db83aca2804fb6390924e +size 21687 diff --git a/checkpoint-1200/rng_state_6.pth b/checkpoint-1200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..fac4c31bb012bccb74ede3a223b6efaab2b81b41 --- /dev/null +++ b/checkpoint-1200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72aa5252e4dd8473b5383b8c74bcf72acf8ea68bf6e54d1c263d57dec2fc1dd6 +size 21687 diff --git a/checkpoint-1200/rng_state_7.pth b/checkpoint-1200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a149b6b99222f8b55a913da4a38255d71ae912e --- /dev/null +++ b/checkpoint-1200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb22b5e950a14c4a77bfad244d30f4589fd1944c2820e5f560936cc1640af0c +size 21687 diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..16f5501c1a3d9efaf5f9e5a3cab718a63716ccda --- /dev/null +++ b/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3ae922322abc72af5a5a4e1e96d2b6312b8af582e691b3aadd460ab4b8f1cab +size 627 diff --git a/checkpoint-1200/special_tokens_map.json b/checkpoint-1200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-1200/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-1200/tokenization_chatglm.py b/checkpoint-1200/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-1200/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece 
import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-1200/tokenizer.model b/checkpoint-1200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-1200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-1200/tokenizer_config.json b/checkpoint-1200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-1200/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..699e0a08cb769a7011928409cb2659f4a3874f64 --- /dev/null +++ b/checkpoint-1200/trainer_state.json @@ -0,0 +1,739 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8041547997989613, + "eval_steps": 500, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 1.0916468228205052e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1200/training_args.bin b/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-1400/README.md b/checkpoint-1400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-1400/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-1400/adapter_config.json b/checkpoint-1400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-1400/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1400/adapter_model.bin b/checkpoint-1400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..aa94846432461906d856d6433760d765c1278322 --- /dev/null +++ b/checkpoint-1400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2adba2088f9c10df3366820a3640d294362738b7833f646fb0aada9e022509b4 +size 7820185 diff --git a/checkpoint-1400/added_tokens.json b/checkpoint-1400/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-1400/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-1400/optimizer.pt b/checkpoint-1400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..42f4b86f79ac22e7a7d42d510fd13ff823d09492 
--- /dev/null +++ b/checkpoint-1400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6fb89a4e87a35a20a2df7a933b7b79754a58cd415cb77156d124e5cac602fb7 +size 15644485 diff --git a/checkpoint-1400/rng_state_0.pth b/checkpoint-1400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc499bfb3c4df9ed66d890d51acdd7f885461eca --- /dev/null +++ b/checkpoint-1400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab5739537ca55b00510274a2ba79cfec22f0b9a491d3985ee17dd9b877690570 +size 21687 diff --git a/checkpoint-1400/rng_state_1.pth b/checkpoint-1400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e39d5a079fc4d1409892503e60cbf552c82b4d93 --- /dev/null +++ b/checkpoint-1400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d92c8eb1445e8552b529fd1362681c4741e924edb90aab657e9e302630fe997 +size 21687 diff --git a/checkpoint-1400/rng_state_2.pth b/checkpoint-1400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..dda34d247a58121da73ffd164beabce340e58676 --- /dev/null +++ b/checkpoint-1400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74a9e90208cef19506874f9ff5e268f2ddaff93f635835cb66b568e7c4d309fc +size 21687 diff --git a/checkpoint-1400/rng_state_3.pth b/checkpoint-1400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4d50274068016f70c82050629ac92e14d770ead --- /dev/null +++ b/checkpoint-1400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:929ec3bb4fa76f477b9a4c2661320e3713a65bcc8094550d5f6e23645fd57386 +size 21687 diff --git a/checkpoint-1400/rng_state_4.pth b/checkpoint-1400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..86ca737a99c30a7081298ac21e6074aee514d447 --- /dev/null +++ b/checkpoint-1400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e50089c0ffcf43456215c84044251bab83c3065f7bd1d64cf78c1b985d8c683d +size 21687 diff --git a/checkpoint-1400/rng_state_5.pth b/checkpoint-1400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c67ec2183804f08b14ca133a2ef71dbe15ce8617 --- /dev/null +++ b/checkpoint-1400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aabd18bdc7108b61055ae698c2f17136e13a7e6aeb7431d18a5b093b0f06b914 +size 21687 diff --git a/checkpoint-1400/rng_state_6.pth b/checkpoint-1400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..14a98ef7235f994d2a7a8da4216614b1457f3c42 --- /dev/null +++ b/checkpoint-1400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cefee1e69eec4333c55f6ba57b9f749df16517b3e9fd80bac64fe51736096caf +size 21687 diff --git a/checkpoint-1400/rng_state_7.pth b/checkpoint-1400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..28cbcf21752816dc9c53e10611c044846d87b1db --- /dev/null +++ b/checkpoint-1400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:213a1f81be2f0840625b7b2e0c11fb77f9604ca482ba476196e81a5727c0e39c +size 21687 diff --git a/checkpoint-1400/scheduler.pt b/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f767a48715231417e8d4caaddc06053dc2247a06 --- /dev/null +++ b/checkpoint-1400/scheduler.pt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:220077401bab4cf175308a8af4d9b4dfe1e228b8bc054134b6e26e033daa9159 +size 627 diff --git a/checkpoint-1400/special_tokens_map.json b/checkpoint-1400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-1400/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-1400/tokenization_chatglm.py b/checkpoint-1400/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-1400/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-1400/tokenizer.model b/checkpoint-1400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-1400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-1400/tokenizer_config.json b/checkpoint-1400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-1400/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-1400/trainer_state.json b/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..23e822e162f0dd8b5d48115087d41ac7ed883205 --- /dev/null +++ b/checkpoint-1400/trainer_state.json @@ -0,0 +1,859 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9381805997654549, + "eval_steps": 500, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 1.2735452565290877e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1400/training_args.bin 
b/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-1600/README.md b/checkpoint-1600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-1600/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-1600/adapter_config.json b/checkpoint-1600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-1600/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1600/adapter_model.bin b/checkpoint-1600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..868ed056396e3cb7716aabd2c80834450ac91375 --- /dev/null +++ b/checkpoint-1600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64b6cc94d26c89d350ade553a13e985804160046af6912a92087e583012e8976 +size 7820185 diff --git a/checkpoint-1600/added_tokens.json b/checkpoint-1600/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-1600/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-1600/optimizer.pt b/checkpoint-1600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f2d322c458d1dcdbb05f7089d3a37e9b5f1f8e0 --- /dev/null +++ b/checkpoint-1600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a10d3561d5b275356df7426d491fee5f84191606958ca2a954f8fbc617d2d80 +size 15644485 diff --git a/checkpoint-1600/rng_state_0.pth b/checkpoint-1600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a8fc64388b70702417eacccb7ff572fd3ab88adb --- /dev/null +++ b/checkpoint-1600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b06971094002591248ef2c745e68c7de46262aa999e1f1c8c81c3aab22048adb +size 21687 diff --git a/checkpoint-1600/rng_state_1.pth b/checkpoint-1600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ccb3c17578135a6523edfe586f21398deed17ed --- /dev/null +++ b/checkpoint-1600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cefce07aee47e781dd7b5f36f9786f7495104bbb801d5604832390257bf772cb +size 21687 diff --git a/checkpoint-1600/rng_state_2.pth b/checkpoint-1600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..38eb600494df2bf01f54e9d5030ee8d119c99943 --- /dev/null +++ b/checkpoint-1600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60087a87498fd428146c6f64544c2d060de32feeebc2e6907eeefbabd858b937 +size 21687 diff --git a/checkpoint-1600/rng_state_3.pth b/checkpoint-1600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6b3243c44d0c8a691198bfa307daa4ed76234a5f --- /dev/null +++ b/checkpoint-1600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1825b1f237a2cc52c2bcdf5768278c5de2471e78e6e949b4675a87502a742df +size 21687 diff --git a/checkpoint-1600/rng_state_4.pth b/checkpoint-1600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..00887b97c5aff4fe293d98c6e95935d5640cec35 --- /dev/null +++ b/checkpoint-1600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70272e477ad63b100f0d34a2a4f68cfff94de04cd75bfcc7393ac4b9f04bd86d +size 21687 diff --git a/checkpoint-1600/rng_state_5.pth b/checkpoint-1600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c3afcc3bc2cf951dd92877957ae04e9b8ef2b88 --- /dev/null +++ b/checkpoint-1600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0c8a524f961d646542e8e59b7d03625822c17baa539fb5c8ad1be470b159bc2 +size 21687 diff --git a/checkpoint-1600/rng_state_6.pth b/checkpoint-1600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..b63125f3507293f96c25a217f472d4f950e12117 --- /dev/null +++ b/checkpoint-1600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:578e8f1ceaba3e0b59ad33603535d5734a2ce3c6d07912226d47c131ce5f140b +size 21687 diff --git a/checkpoint-1600/rng_state_7.pth b/checkpoint-1600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..dbc5ad2ff59486e2cf4e2827d331f546d298b2a7 --- /dev/null +++ b/checkpoint-1600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d24ced1073b7702b59fbee699c23a71f747a7a1ae5003389d66c36846e2acee +size 21687 diff --git a/checkpoint-1600/scheduler.pt b/checkpoint-1600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5711ca57a5e056c1375c2f5960e76e7d0f62e47a --- /dev/null +++ b/checkpoint-1600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08f5ce56a566b122a18335e8e591f39466cb608da353f076727c113998f2be52 +size 627 diff --git a/checkpoint-1600/special_tokens_map.json b/checkpoint-1600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-1600/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-1600/tokenization_chatglm.py b/checkpoint-1600/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-1600/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece 
import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-1600/tokenizer.model b/checkpoint-1600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-1600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-1600/tokenizer_config.json b/checkpoint-1600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-1600/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-1600/trainer_state.json b/checkpoint-1600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b54c727f58502dc529e0e460161ed819ab9fe434 --- /dev/null +++ b/checkpoint-1600/trainer_state.json @@ -0,0 +1,979 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0722063997319484, + "eval_steps": 500, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 1.4555917257344549e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1600/training_args.bin b/checkpoint-1600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-1600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-1800/README.md b/checkpoint-1800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-1800/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users 
(both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-1800/adapter_config.json b/checkpoint-1800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-1800/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1800/adapter_model.bin b/checkpoint-1800/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..2e79e0a4350081b3e7a627027825c9aa380041bf --- /dev/null +++ b/checkpoint-1800/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a4a3231f80393dab65a5c6f38257c52b9af14bd269d7b3bafa6f6c3d9c73581 +size 7820185 diff --git a/checkpoint-1800/added_tokens.json b/checkpoint-1800/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-1800/added_tokens.json @@ 
-0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-1800/optimizer.pt b/checkpoint-1800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b86a514e3b2fc25f91c0759e36cabebb6382fc5 --- /dev/null +++ b/checkpoint-1800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4908057ad26449ad17956a4fbaaf1175b499a59d3cd2b7878d7e759522fd465 +size 15644485 diff --git a/checkpoint-1800/rng_state_0.pth b/checkpoint-1800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3615070b2f4e540d36b965d8c34e823f4ad7b02 --- /dev/null +++ b/checkpoint-1800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de5c0bef98024985fde5c74ebcd4b3cfddda4705bda1c72d7c0fcf8eda35407 +size 21687 diff --git a/checkpoint-1800/rng_state_1.pth b/checkpoint-1800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4449fee2b14b49732147dc827e2aaaffd705b4a --- /dev/null +++ b/checkpoint-1800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce7bf3b28a89a9c81ae066754319c82510d248e2957830dfe43e431f4c770c5b +size 21687 diff --git a/checkpoint-1800/rng_state_2.pth b/checkpoint-1800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..49cedca144ea82c6a9a99706251cedd9e93df2b3 --- /dev/null +++ b/checkpoint-1800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c3c8726800b36ae1571db20e3adc2a07787486f4722436db01ee6d1b6599794 +size 21687 diff --git a/checkpoint-1800/rng_state_3.pth b/checkpoint-1800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d105be60fdd54085da88a2e549b8dd107ebaa46a --- /dev/null +++ b/checkpoint-1800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8422ed8ea33f7d4684bdbb9fe0f99388f89f0f8e2439a3876159b9fd9420a3a +size 21687 diff --git a/checkpoint-1800/rng_state_4.pth b/checkpoint-1800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..2632807a190ecc369a73561c4be911be49833640 --- /dev/null +++ b/checkpoint-1800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:683da63ebd95f410c85f5c5b51983c16a7a544c463a613e2648062ce11953e03 +size 21687 diff --git a/checkpoint-1800/rng_state_5.pth b/checkpoint-1800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..701aeebcb256a467dcb3c81b04d2bf65fb6c1b2d --- /dev/null +++ b/checkpoint-1800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b6454a20db87446d30b004332ad95c58a133d92bb485eff86ca54ad68d5378c +size 21687 diff --git a/checkpoint-1800/rng_state_6.pth b/checkpoint-1800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..5196e9ae1fd8899dbb033bfa08212d89106f3834 --- /dev/null +++ b/checkpoint-1800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e36720dc9bcd5c60e027810f900ec52e6d64f146ade1fb9eb070433c0ca19cf8 +size 21687 diff --git a/checkpoint-1800/rng_state_7.pth b/checkpoint-1800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..6d8468e5009d801e870570bdbb27787c4456e131 --- /dev/null +++ b/checkpoint-1800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8c9c0515a2ac990b1abaa79e04b82e21d16ed89bf1e9912ecd022fe672f682c6 +size 21687 diff --git a/checkpoint-1800/scheduler.pt b/checkpoint-1800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2753a0c773e3e6bab79a1a443b79a5305f9316a1 --- /dev/null +++ b/checkpoint-1800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f440f5dc6b69ab8fba9ee36a41a84be953b87f90d3096ea880f6250d6322521 +size 627 diff --git a/checkpoint-1800/special_tokens_map.json b/checkpoint-1800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-1800/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-1800/tokenization_chatglm.py b/checkpoint-1800/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-1800/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-1800/tokenizer.model b/checkpoint-1800/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-1800/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-1800/tokenizer_config.json b/checkpoint-1800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-1800/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-1800/trainer_state.json b/checkpoint-1800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..005fcfb9f9126f2b068eb4c85fba24613804777e --- /dev/null +++ b/checkpoint-1800/trainer_state.json @@ -0,0 +1,1099 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.206232199698442, + "eval_steps": 500, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 1.6374870576176562e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1800/training_args.bin b/checkpoint-1800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-1800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-200/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
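+A back-of-the-envelope sketch of that estimate (every figure below is an assumed placeholder, not a measurement of this run; the eight rng_state_*.pth files in this dump suggest 8 GPUs, and the fields below should be filled in to refine it):
+
+```python
+# Rough CO2 estimate: hours x GPU count x per-GPU power (kW) x grid intensity (kgCO2/kWh).
+# All numbers are assumptions to be replaced with the real values for this run.
+hours = 24.0                 # wall-clock training time (not recorded here)
+num_gpus = 8                 # suggested by rng_state_0..7 above
+gpu_power_kw = 0.3           # e.g. ~300 W per accelerator
+grid_kg_co2_per_kwh = 0.4    # region-dependent carbon intensity
+print(f"~{hours * num_gpus * gpu_power_kw * grid_kg_co2_per_kwh:.1f} kg CO2eq")
+```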
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-200/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-200/adapter_model.bin b/checkpoint-200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..345b58dcadc3c3feffce90f99398a8985cd54973 --- /dev/null +++ b/checkpoint-200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8121fad51e0e04c3242f3fece4241cf2a342132a3dd3d918f59999283424b75 +size 7820185 diff --git a/checkpoint-200/added_tokens.json b/checkpoint-200/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-200/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..acee8376705ece9ecadc0d05758a942ff99ea44a --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e476dea0ff690e41f2b669e8e462dc67b5438e144a2b9931c07884737940323 +size 15644485 diff --git a/checkpoint-200/rng_state_0.pth b/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a8fbde54f01e80fe0f2feacd69ee34981df536cf --- /dev/null +++ b/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09dd66a3fea6fd24dd2a748bbb84b77ffec71c9f5eb55961abf8edea88d7ce59 +size 21687 diff --git a/checkpoint-200/rng_state_1.pth b/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..472d6be29fd04d1f1465adf8fb35f6a42bab07a3 --- /dev/null +++ b/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:dc9eea11d57071ba8dc29f878293b27d3a4fe6deec15766d6da23290bb30ee22 +size 21687 diff --git a/checkpoint-200/rng_state_2.pth b/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a6d2613f6cf1fc7a915e0166da28d372c924dce --- /dev/null +++ b/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:983b7426a2fad93fd64db140ee4aafeb01d357a906f38935e6f9f8d5db08d16c +size 21687 diff --git a/checkpoint-200/rng_state_3.pth b/checkpoint-200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c4a711aaddf275405d21d7b4d395bf80f762e230 --- /dev/null +++ b/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e290328de2eb330817da980a8bfd6f0f4e8f978fd26b7afae3a9a3f4921dd04f +size 21687 diff --git a/checkpoint-200/rng_state_4.pth b/checkpoint-200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c2c3dfb23805f106ad744a39e1632e05156f1b7f --- /dev/null +++ b/checkpoint-200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b2f4d42692cd01c7ee0e72784db7895d407e5fb1a880fcb06d44e22d01af652 +size 21687 diff --git a/checkpoint-200/rng_state_5.pth b/checkpoint-200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..701473b141ccbc3515ea1961b714c2a17e70764f --- /dev/null +++ b/checkpoint-200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:027dfecd2adeee2c7a5884ab9a86d8c52b19bfa266e219aa392f4842c0676b0b +size 21687 diff --git a/checkpoint-200/rng_state_6.pth b/checkpoint-200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..ae4c3f03fe053c2bf451d61afaa082778554a300 --- /dev/null +++ b/checkpoint-200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b537c704e70c0bc32846d88d7f7c4936ee10cde229ae4b17cdb9a11a32e775ba +size 21687 diff --git a/checkpoint-200/rng_state_7.pth b/checkpoint-200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c2685f11c7ae45bd41e776151f3477a132fe451 --- /dev/null +++ b/checkpoint-200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f464417c0e08423a9b5df49521234577c2fa1430f64cdd990fbc0a017ccf599 +size 21687 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d78dbd6654be3614011c6ea1c4808cb853f21eb --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f015aca50758d7ce7e47126426a3f6d232fbfcb245263d634f778a05edef960 +size 627 diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-200/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-200/tokenization_chatglm.py b/checkpoint-200/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-200/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor 
+from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) into an id using the vocab.
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) into a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "<bos>": self.tokenizer.bos_id, + "<eos>": self.tokenizer.eos_id, + "<pad>": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "<unk>" + + @property + def pad_token(self) -> str: + return "<unk>" + + @property + def pad_token_id(self): + return self.get_command("<pad>") + + @property + def eos_token(self) -> str: + return "</s>" + + @property + def eos_token_id(self): + return self.get_command("<eos>") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) into an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) into a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the names of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved.
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ced0e32f354e9591ec97d39ba7077126bd80ee55 --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,139 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.13402579996649355, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 10 + }, + 
{ + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 1.81903539565704e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-2000/README.md b/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-2000/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## 
Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-2000/adapter_config.json b/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-2000/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-2000/adapter_model.bin b/checkpoint-2000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8580c760aa20b5baa8b4439da23bcf90c915756e --- /dev/null +++ b/checkpoint-2000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69a47c1b073b5f4a99a1cf60f15a9e8b3f2400b6286d546548cd6b053a5eb5d4 +size 7820185 diff --git a/checkpoint-2000/added_tokens.json b/checkpoint-2000/added_tokens.json new file mode 100644 index 
0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-2000/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..85d4ede006f4a3b4826b7d0b8afd2a7bdc64e5e2 --- /dev/null +++ b/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11a7dd95968af7f40c3ccd04a7239f2d728c49bbd7182973e7ec034f8d94f908 +size 15644485 diff --git a/checkpoint-2000/rng_state_0.pth b/checkpoint-2000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ba51ed116fe08d7aadc5577b55c4b9f3449f49cb --- /dev/null +++ b/checkpoint-2000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb082ef15a29d3e27a8895593ba47428654ff1836e0c76040e8cec462f66d6e +size 21687 diff --git a/checkpoint-2000/rng_state_1.pth b/checkpoint-2000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c2708cb655ede59b9f119dbb08b0b407b5ac545 --- /dev/null +++ b/checkpoint-2000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d919253616335e173cc375eb85124fd2c985a0fe7890d821293dcb0f9e789fb9 +size 21687 diff --git a/checkpoint-2000/rng_state_2.pth b/checkpoint-2000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..76fac84c61ffd11ee5e25fab71e2d6cf6c7201e9 --- /dev/null +++ b/checkpoint-2000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e1157648cfb133d706338d929a49a95363b0f992989a7177039948bb3ad773 +size 21687 diff --git a/checkpoint-2000/rng_state_3.pth b/checkpoint-2000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..74528de6820b35d079db0e97c4db35050019b32a --- /dev/null +++ b/checkpoint-2000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0bd77d0a1a86a4c937d8a11c67b6d4eb512c380bc2054687f3dc3d11cc37e23 +size 21687 diff --git a/checkpoint-2000/rng_state_4.pth b/checkpoint-2000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..3da1b732d5f2294c61dd7c36b17ee0a456027887 --- /dev/null +++ b/checkpoint-2000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a326e72b356aab407aa191e65ecc531041da30ca95201c1c7b01cfc0cc3695ce +size 21687 diff --git a/checkpoint-2000/rng_state_5.pth b/checkpoint-2000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..431f4febdba71552d981e9c916860d6f21b06140 --- /dev/null +++ b/checkpoint-2000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e889fbeb214e292401d2d55da3af515cff94f0b02c732cd988de25e11e8f4f88 +size 21687 diff --git a/checkpoint-2000/rng_state_6.pth b/checkpoint-2000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..4d4fe7394076a7021890f12e2eca7dc5440be606 --- /dev/null +++ b/checkpoint-2000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:743e19519cc5db999d5556803073eb198bc1c1109b6edf2e7517e655283c454b +size 21687 diff --git a/checkpoint-2000/rng_state_7.pth b/checkpoint-2000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..a333414437feb2697c6a5a872e84d739e0776bf9 --- /dev/null +++ 
b/checkpoint-2000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ebcab3be17c0757640535b664064ac17b5497c2df4bb317577fc0b35367852c +size 21687 diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3dea863f860d23a8f0872f8e83b890cc0bd3bd3b --- /dev/null +++ b/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf18788016afeaf582382f00bbbee5c2506519fe4950387802e85b12f9837204 +size 627 diff --git a/checkpoint-2000/special_tokens_map.json b/checkpoint-2000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-2000/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-2000/tokenization_chatglm.py b/checkpoint-2000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-2000/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) into an id using the vocab.
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) into a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "<bos>": self.tokenizer.bos_id, + "<eos>": self.tokenizer.eos_id, + "<pad>": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "<unk>" + + @property + def pad_token(self) -> str: + return "<unk>" + + @property + def pad_token_id(self): + return self.get_command("<pad>") + + @property + def eos_token(self) -> str: + return "</s>" + + @property + def eos_token_id(self): + return self.get_command("<eos>") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) into an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) into a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the names of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved.
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-2000/tokenizer.model b/checkpoint-2000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-2000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-2000/tokenizer_config.json b/checkpoint-2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-2000/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..54226f786960ec11e18f4a84893fcf0f8bd92a6f --- /dev/null +++ b/checkpoint-2000/trainer_state.json @@ -0,0 +1,1219 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.3402579996649355, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 1.8194055788882821e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-2200/README.md b/checkpoint-2200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-2200/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model 
[optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-2200/adapter_config.json b/checkpoint-2200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-2200/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-2200/adapter_model.bin b/checkpoint-2200/adapter_model.bin new file mode 100644 index 
0000000000000000000000000000000000000000..da354aedf208febdd772a42405809837b3dd1acb --- /dev/null +++ b/checkpoint-2200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85c4bda829f4d4f968089a242c491769214f23a7a511fdee1dd378f775fb0fa1 +size 7820185 diff --git a/checkpoint-2200/added_tokens.json b/checkpoint-2200/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-2200/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-2200/optimizer.pt b/checkpoint-2200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7703d122f08cd83c297d537ec1a63ea6bcb807fb --- /dev/null +++ b/checkpoint-2200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8247712027ba312b19bc3287ad0288a2dc2438a37a90571fb532a555d7f2df71 +size 15644485 diff --git a/checkpoint-2200/rng_state_0.pth b/checkpoint-2200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6bd5e631e550ec93222b944f6d532b31ad621beb --- /dev/null +++ b/checkpoint-2200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6f8af19b0926b69283255a49255ed684c32241051c1b9e811d1eae93b0804c6 +size 21687 diff --git a/checkpoint-2200/rng_state_1.pth b/checkpoint-2200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9cf50184f3a088e585a214ec34cfbc5990a3cccc --- /dev/null +++ b/checkpoint-2200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bfd6e908128b8c2f5d7bd61761abce9bf1925cc413e0a47f4caca22379767a5 +size 21687 diff --git a/checkpoint-2200/rng_state_2.pth b/checkpoint-2200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..38b6a9e8ab8686d08bacd32ced486c68b7b0bbe5 --- /dev/null +++ b/checkpoint-2200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c7a1efb12d475fd9affe80f06856a4067eaa8dc3ad1872baa2631e1c3e8d261 +size 21687 diff --git a/checkpoint-2200/rng_state_3.pth b/checkpoint-2200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..06220ff8a7c4805d02918699d1c6ed3ba310bfb5 --- /dev/null +++ b/checkpoint-2200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f87657d9ad6df8d1863dbc60168578e1084b94eb3eb9f6d7317e6ffc2c697cde +size 21687 diff --git a/checkpoint-2200/rng_state_4.pth b/checkpoint-2200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..0ed6af4a5f9aba06a3218cc82a9ee48c94369a11 --- /dev/null +++ b/checkpoint-2200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2db96f1e8d9aa5a36b75d1a88aaf2ab4e81c82bd086123138f6f907244f486a +size 21687 diff --git a/checkpoint-2200/rng_state_5.pth b/checkpoint-2200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd6af75865d1ca7b0cc6f4756860c866647501f8 --- /dev/null +++ b/checkpoint-2200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b64a43d6ec2cf176f1e45887b2b7fb83a6df623fbeb311b3ba1865fd7650bc1 +size 21687 diff --git a/checkpoint-2200/rng_state_6.pth b/checkpoint-2200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..88459944c983c31434247c6b7eb89d8e98ef17b5 --- /dev/null +++ 
b/checkpoint-2200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:706c19df9ab656348a572785698aabbe17f0c15a42153dbcdf52429a2b0383f1 +size 21687 diff --git a/checkpoint-2200/rng_state_7.pth b/checkpoint-2200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..a039613353fa64a7256afb77c7d694efc84060c7 --- /dev/null +++ b/checkpoint-2200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1825d3afacc077d94043f161cfa305e9adeb17bab04ff14538da40ffd9e44588 +size 21687 diff --git a/checkpoint-2200/scheduler.pt b/checkpoint-2200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9f5906359f97673a13972a359cfd4eed2b422ee --- /dev/null +++ b/checkpoint-2200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2521e72097db26e0adf55b0dd1edbfa467e58ae2788c3f900b26c5d294c313f5 +size 627 diff --git a/checkpoint-2200/special_tokens_map.json b/checkpoint-2200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-2200/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-2200/tokenization_chatglm.py b/checkpoint-2200/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-2200/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-2200/tokenizer.model b/checkpoint-2200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-2200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-2200/tokenizer_config.json b/checkpoint-2200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-2200/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-2200/trainer_state.json b/checkpoint-2200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..141c37bf4e3595f3344e0ab88853138e3c95e3fc --- /dev/null +++ b/checkpoint-2200/trainer_state.json @@ -0,0 +1,1339 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.474283799631429, + "eval_steps": 500, + "global_step": 2200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 2.001340058539393e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2200/training_args.bin b/checkpoint-2200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-2200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-2400/README.md b/checkpoint-2400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-2400/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
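+Pending the project's own snippet, below is a minimal sketch for loading this LoRA checkpoint on top of the base model. The base-model path is taken from adapter_config.json; the checkpoint directory name, the half-precision CUDA placement, and the chat() helper (provided by the ChatGLM remote code and forwarded through PeftModel) are assumptions, not part of this training run:
+
+```python
+from transformers import AutoModel, AutoTokenizer
+from peft import PeftModel
+
+# Base-model path as recorded in adapter_config.json; adjust for your environment (assumption).
+base_model_path = "/home/hz/projects/chatglm3-6b-32k"
+adapter_path = "./checkpoint-2400"  # this LoRA checkpoint directory (assumption)
+
+tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
+model = AutoModel.from_pretrained(base_model_path, trust_remote_code=True).half().cuda()
+model = PeftModel.from_pretrained(model, adapter_path)  # attach the LoRA adapter weights
+model.eval()
+
+# chat() comes from the ChatGLM remote code; PeftModel forwards it to the base model (assumption).
+response, history = model.chat(tokenizer, "Hello", history=[])
+print(response)
+```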
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-2400/adapter_config.json b/checkpoint-2400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-2400/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-2400/adapter_model.bin b/checkpoint-2400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..11921aa7943f9dd1160d3da0391f47aff7413408 --- /dev/null +++ b/checkpoint-2400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ab8010adace31c0c52a6caeedcce6fa5775662e69cecdd01b4169109f0e20ae +size 7820185 diff --git a/checkpoint-2400/added_tokens.json b/checkpoint-2400/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-2400/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-2400/optimizer.pt b/checkpoint-2400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f89c268d67fb4cebba99d58868b6c465d1932f2d 
--- /dev/null +++ b/checkpoint-2400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22bea873dbfa0ac5c309ad5eae9e408866a482dedc41e14693a446ef80ffa537 +size 15644485 diff --git a/checkpoint-2400/rng_state_0.pth b/checkpoint-2400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..dedf548e5f11f0f879737c515c6cbacd136589dc --- /dev/null +++ b/checkpoint-2400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1360783928bbe2eb9de9cbabb52576421f2d9fa225f43085f117f0761d9f55de +size 21687 diff --git a/checkpoint-2400/rng_state_1.pth b/checkpoint-2400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6f51680cf73b7c49413f8fde940c420a3e07583d --- /dev/null +++ b/checkpoint-2400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7f5e42c91f0e0eb51ffe4fc04fd3c0448fcbb00b62d05b294808a8f4ccf1b6a +size 21687 diff --git a/checkpoint-2400/rng_state_2.pth b/checkpoint-2400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..1afb0c4abb11e06abeabbbc0cbf967f4a70671ef --- /dev/null +++ b/checkpoint-2400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e1739bfea79578e1c6d956d3fe321864a4bc57857226188028a8a6cb808c6ff +size 21687 diff --git a/checkpoint-2400/rng_state_3.pth b/checkpoint-2400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..915fc719e21267f58c8e3d1fa367414123ec7539 --- /dev/null +++ b/checkpoint-2400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bf67aaf6f059fea6fbfe831dcf32f4a55cb265c0157375079349509200079d0 +size 21687 diff --git a/checkpoint-2400/rng_state_4.pth b/checkpoint-2400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..fa008b1d77d47f196c7d5b57ce789311d19aad0d --- /dev/null +++ b/checkpoint-2400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87e5f66405cd6ae5baf33a7dcb9981319c23fd70f5ebe35be32fd86f96b7a9e4 +size 21687 diff --git a/checkpoint-2400/rng_state_5.pth b/checkpoint-2400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..600359eee3d767cc1691c1c989d441d741550488 --- /dev/null +++ b/checkpoint-2400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ff571971b7bc04489be0564b8be10bde67bad2d9ab664a4c51898d750fdc574 +size 21687 diff --git a/checkpoint-2400/rng_state_6.pth b/checkpoint-2400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..c180cdb84c4ee0a5bfbd986bdc232a02c8fb19d9 --- /dev/null +++ b/checkpoint-2400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac9921f6cbebd3adb4df867313a9881737089be07a87bd30dc8068ee58082fef +size 21687 diff --git a/checkpoint-2400/rng_state_7.pth b/checkpoint-2400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..37b7e4e80c30ece5e05b41155f9c839b3145dad0 --- /dev/null +++ b/checkpoint-2400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cddb98c9085e412b8bea067e2cdeca59565e4ff0bef0e9d01e160e6071aa95d2 +size 21687 diff --git a/checkpoint-2400/scheduler.pt b/checkpoint-2400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..04ad4d5402e8fa97b63fea2a25483dab70d41c00 --- /dev/null +++ b/checkpoint-2400/scheduler.pt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b914258e6f9ec4f9613e8b986c144bc6f46da4330e6ee18fe51c04613ace56d4 +size 627 diff --git a/checkpoint-2400/special_tokens_map.json b/checkpoint-2400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-2400/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-2400/tokenization_chatglm.py b/checkpoint-2400/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-2400/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+    def _pad(
+            self,
+            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+            max_length: Optional[int] = None,
+            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+            pad_to_multiple_of: Optional[int] = None,
+            return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                - 'left': pads on the left of the sequences
+                - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        assert self.padding_side == "left"
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+        seq_length = len(required_input)
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        # Round max_length up to the next multiple of pad_to_multiple_of if needed.
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * seq_length
+
+        if "position_ids" not in encoded_inputs:
+            encoded_inputs["position_ids"] = list(range(seq_length))
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+
+            # Left padding: zeros in front of the mask and position ids, pad ids in front of the input.
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+            if "position_ids" in encoded_inputs:
+                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
+            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+
+        return encoded_inputs
diff --git a/checkpoint-2400/tokenizer.model b/checkpoint-2400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-2400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-2400/tokenizer_config.json b/checkpoint-2400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-2400/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-2400/trainer_state.json b/checkpoint-2400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..67fc1616f2b7476edafe90ebb1fff5fc8ed666f8 --- /dev/null +++ b/checkpoint-2400/trainer_state.json @@ -0,0 +1,1459 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6083095995979226, + "eval_steps": 500, + "global_step": 2400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 2.183284527425441e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2400/training_args.bin b/checkpoint-2400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-2400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-2600/README.md b/checkpoint-2600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d 
--- /dev/null +++ b/checkpoint-2600/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-2600/adapter_config.json b/checkpoint-2600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-2600/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-2600/adapter_model.bin b/checkpoint-2600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..23860eee2134df31c10168bade973f91374270aa --- /dev/null +++ b/checkpoint-2600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5a8d2c67019200ff5875c6ec4ee89d5c579943e3d3a5c35633ae1223a3972e7 +size 7820185 diff --git a/checkpoint-2600/added_tokens.json b/checkpoint-2600/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-2600/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-2600/optimizer.pt b/checkpoint-2600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f685b9caa036bfa18e11498be018d80c9812193e --- /dev/null +++ b/checkpoint-2600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f293e1a7cdcc1616fd2fcd1ddbc18065473565a52fe8c1745eb3d9a3dbbf2c3 +size 15644485 diff --git a/checkpoint-2600/rng_state_0.pth b/checkpoint-2600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd19b7580aecb823cc46d5707ea7db7328eb559b --- /dev/null +++ b/checkpoint-2600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3f9004456958c9a7682d1ca3d79a00a923d9530381f01da37e12786cd358cd8 +size 21687 diff --git a/checkpoint-2600/rng_state_1.pth b/checkpoint-2600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1eedd3f68168dc30bd5497eedfc2b765b9d00f32 --- /dev/null +++ b/checkpoint-2600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bb23ee222d30667df98a8d77b4fd35aef229b4e2bcbd658db0559f5d9da26f11 +size 21687 diff --git a/checkpoint-2600/rng_state_2.pth b/checkpoint-2600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..31b69eb6e240e8e7fd97d330a79cab0cad2d2089 --- /dev/null +++ b/checkpoint-2600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e186ec2c59eecc657fbf828e5a999e8b0c2089b209e346bc558d5fb060562a +size 21687 diff --git a/checkpoint-2600/rng_state_3.pth b/checkpoint-2600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d90a3e8d722a57d28e8e7366455da038cb17f306 --- /dev/null +++ b/checkpoint-2600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2734c12e4bca980ef6f6ed91fa3b03ca76feeb3dbaa394c6df6892f6f6c838f4 +size 21687 diff --git a/checkpoint-2600/rng_state_4.pth b/checkpoint-2600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..88bce2da801ad920784fce8054ed59e410a0da26 --- /dev/null +++ b/checkpoint-2600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a897704dfde5290ed0287b8d17da7e8be9ca9c661bff054b3f194799bd9d9756 +size 21687 diff --git a/checkpoint-2600/rng_state_5.pth b/checkpoint-2600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..540ea9c20bda8aa838ff02c0688c27fb0b404220 --- /dev/null +++ b/checkpoint-2600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb0f4c32bc61ed21cfdd5fb56c7ed054db9be7e0b5537d43281777b437f55cc2 +size 21687 diff --git a/checkpoint-2600/rng_state_6.pth b/checkpoint-2600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c01184eda38311ccdfa80314d198fc6a9c5d671 --- /dev/null +++ b/checkpoint-2600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:411e840fc3345c9a833c69d73f734452f8715dd159868f701dcc6fb650dd6f7a +size 21687 diff --git a/checkpoint-2600/rng_state_7.pth b/checkpoint-2600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..761aa94629b968945e0db26cf9f4ca4491144eec --- /dev/null +++ b/checkpoint-2600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22308cdee4d04080fcfebfffef680374ec4dfcc17528b58eee564c3a8b8b40d3 +size 21687 diff --git a/checkpoint-2600/scheduler.pt b/checkpoint-2600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..da1699b352ab6a9701719611caad9562ecce1a31 --- /dev/null +++ b/checkpoint-2600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4238873c45789b3ca53e774ab925769aac6f2c49af2791ac1e67e5fd9c343e9c +size 627 diff --git a/checkpoint-2600/special_tokens_map.json b/checkpoint-2600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-2600/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-2600/tokenization_chatglm.py b/checkpoint-2600/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-2600/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece 
import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-2600/tokenizer.model b/checkpoint-2600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-2600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-2600/tokenizer_config.json b/checkpoint-2600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-2600/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-2600/trainer_state.json b/checkpoint-2600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2fbb850b820975ef7b6ad0579b659ceaf95686af --- /dev/null +++ b/checkpoint-2600/trainer_state.json @@ -0,0 +1,1579 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7423353995644162, + "eval_steps": 500, + "global_step": 2600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + }, + { + "epoch": 1.62, + "learning_rate": 2.2006682755823367e-05, + "loss": 0.3871, + "step": 2410 + }, + { + "epoch": 1.62, + "learning_rate": 2.1832551450078594e-05, + "loss": 0.3867, + "step": 2420 + }, + { + "epoch": 1.63, + "learning_rate": 2.165857618125625e-05, + "loss": 0.3878, + "step": 2430 + }, + { + "epoch": 1.64, + "learning_rate": 2.1484765519839843e-05, + "loss": 0.386, + "step": 2440 + }, + { + "epoch": 1.64, + "learning_rate": 2.1311128028203863e-05, + "loss": 0.3836, + "step": 2450 + }, + { + "epoch": 1.65, + "learning_rate": 2.1137672260192004e-05, + "loss": 0.3849, + "step": 2460 + }, + { + "epoch": 1.66, + "learning_rate": 2.09644067606958e-05, + "loss": 0.3841, + "step": 2470 + }, + { + "epoch": 1.66, + "learning_rate": 
2.079134006523359e-05, + "loss": 0.3848, + "step": 2480 + }, + { + "epoch": 1.67, + "learning_rate": 2.061848069953017e-05, + "loss": 0.3817, + "step": 2490 + }, + { + "epoch": 1.68, + "learning_rate": 2.044583717909667e-05, + "loss": 0.3839, + "step": 2500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0273418008811125e-05, + "loss": 0.3749, + "step": 2510 + }, + { + "epoch": 1.69, + "learning_rate": 2.0101231682499506e-05, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 1.7, + "learning_rate": 1.9929286682517235e-05, + "loss": 0.3854, + "step": 2530 + }, + { + "epoch": 1.7, + "learning_rate": 1.9757591479331382e-05, + "loss": 0.3894, + "step": 2540 + }, + { + "epoch": 1.71, + "learning_rate": 1.9586154531103373e-05, + "loss": 0.3871, + "step": 2550 + }, + { + "epoch": 1.72, + "learning_rate": 1.9414984283272286e-05, + "loss": 0.382, + "step": 2560 + }, + { + "epoch": 1.72, + "learning_rate": 1.9244089168138836e-05, + "loss": 0.3858, + "step": 2570 + }, + { + "epoch": 1.73, + "learning_rate": 1.9073477604449985e-05, + "loss": 0.3819, + "step": 2580 + }, + { + "epoch": 1.74, + "learning_rate": 1.8903157996984174e-05, + "loss": 0.3795, + "step": 2590 + }, + { + "epoch": 1.74, + "learning_rate": 1.873313873613733e-05, + "loss": 0.3827, + "step": 2600 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 2.3653836855716086e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2600/training_args.bin b/checkpoint-2600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-2600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-2800/README.md b/checkpoint-2800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-2800/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
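+The snippet below is a minimal sketch for loading this LoRA adapter with PEFT. It assumes the base model path recorded in this checkpoint's adapter_config.json ("/home/hz/projects/chatglm3-6b-32k") and the relative adapter directory name ("checkpoint-2800"); both are environment-specific and should be adjusted. + +```python +from transformers import AutoModel, AutoTokenizer +from peft import PeftModel + +# Assumed paths, taken from adapter_config.json and this checkpoint's directory name. +base_model_path = "/home/hz/projects/chatglm3-6b-32k" +adapter_path = "checkpoint-2800" + +# ChatGLM ships custom tokenizer/model code, so trust_remote_code is required. +tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True) +model = AutoModel.from_pretrained(base_model_path, trust_remote_code=True).half().cuda() + +# Attach the LoRA adapter weights saved in this checkpoint (PEFT 0.6.1). +model = PeftModel.from_pretrained(model, adapter_path) +model.eval() +``` 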
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-2800/adapter_config.json b/checkpoint-2800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-2800/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-2800/adapter_model.bin b/checkpoint-2800/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..187b4575998343410070ff0696f0edc17d8bdb33 --- /dev/null +++ b/checkpoint-2800/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98a7b6b7badbe301f8989008430b37c6a51b5be9564f6d609925e0f107395b0f +size 7820185 diff --git a/checkpoint-2800/added_tokens.json b/checkpoint-2800/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-2800/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-2800/optimizer.pt b/checkpoint-2800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7aa079936f53d189cd2ba207d4bd019ea4c2c002 
--- /dev/null +++ b/checkpoint-2800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1bf9fc038693096f7ca66797e5b883ae6530a667ca9d620b6188a64644b4a98 +size 15644485 diff --git a/checkpoint-2800/rng_state_0.pth b/checkpoint-2800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..535fa20d9c45c1e56551ca2f274401ccefeba27a --- /dev/null +++ b/checkpoint-2800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a1981b6e2f235ccc0677902b870343ffca4b9a710e9cf32eef7fe7a17146379 +size 21687 diff --git a/checkpoint-2800/rng_state_1.pth b/checkpoint-2800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a44d3336cd124e26f40083dd2ce0c9c8232759a --- /dev/null +++ b/checkpoint-2800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b33ab17482e66c062cb5287a88894725c1f40e3952ca721131f56ae125d9cd1b +size 21687 diff --git a/checkpoint-2800/rng_state_2.pth b/checkpoint-2800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..80987feafcacd9e5aaa7783c322819a7deca0a64 --- /dev/null +++ b/checkpoint-2800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab1bb0c9b76745ab28714ea881dce0e6abc51f7916160a115c16d0c453d283e4 +size 21687 diff --git a/checkpoint-2800/rng_state_3.pth b/checkpoint-2800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..4632ece2ad9bfbca8aa3323cf28cb48d597aa4a5 --- /dev/null +++ b/checkpoint-2800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26825c2570b598e8d3e018f09796e553a53100f8da3c42671817b877018fd846 +size 21687 diff --git a/checkpoint-2800/rng_state_4.pth b/checkpoint-2800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8397a25475c61a586b7928b344d37a4511d87255 --- /dev/null +++ b/checkpoint-2800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d04c667092dadb71e99dbdf63ac5436e28c8dc396662a1de4efb060a875ed0b +size 21687 diff --git a/checkpoint-2800/rng_state_5.pth b/checkpoint-2800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a4f96c8fb3b6c78c4a7286035a4ea136b004ad0 --- /dev/null +++ b/checkpoint-2800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:670260cf15897e19d1960590cdcfdc80caf7c220aa586384b68a2203f687a7ea +size 21687 diff --git a/checkpoint-2800/rng_state_6.pth b/checkpoint-2800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1175b679e0a31eae546a151bc5d1a9209c46eda3 --- /dev/null +++ b/checkpoint-2800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cc1fc03f66e4577afa625b71d3155e483a68b6eac484da1137247f8c36ffbcd +size 21687 diff --git a/checkpoint-2800/rng_state_7.pth b/checkpoint-2800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..bdf5bb3fbb807a91995a346b13e5f9d49dc1dfa0 --- /dev/null +++ b/checkpoint-2800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5947b17f823afc7f8725b506c572949b3778c8f833c638efdbd7affc7eb5ed4a +size 21687 diff --git a/checkpoint-2800/scheduler.pt b/checkpoint-2800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bb982dc43105ce3c9a375162050ab37b1ea2550 --- /dev/null +++ b/checkpoint-2800/scheduler.pt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fe8e61927045448f2964c6db4870fcf2a6a654806bdac489b3f4864e6f59841 +size 627 diff --git a/checkpoint-2800/special_tokens_map.json b/checkpoint-2800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-2800/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-2800/tokenization_chatglm.py b/checkpoint-2800/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-2800/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) to an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.0` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # This tokenizer only supports left padding + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-2800/tokenizer.model b/checkpoint-2800/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-2800/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-2800/tokenizer_config.json b/checkpoint-2800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-2800/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-2800/trainer_state.json b/checkpoint-2800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..29129b4e01f042b09b089c148fbbe99007a353e4 --- /dev/null +++ b/checkpoint-2800/trainer_state.json @@ -0,0 +1,1699 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8763611995309097, + "eval_steps": 500, + "global_step": 2800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + }, + { + "epoch": 1.62, + "learning_rate": 2.2006682755823367e-05, + "loss": 0.3871, + "step": 2410 + }, + { + "epoch": 1.62, + "learning_rate": 2.1832551450078594e-05, + "loss": 0.3867, + "step": 2420 + }, + { + "epoch": 1.63, + "learning_rate": 2.165857618125625e-05, + "loss": 0.3878, + "step": 2430 + }, + { + "epoch": 1.64, + "learning_rate": 2.1484765519839843e-05, + "loss": 0.386, + "step": 2440 + }, + { + "epoch": 1.64, + "learning_rate": 2.1311128028203863e-05, + "loss": 0.3836, + "step": 2450 + }, + { + "epoch": 1.65, + "learning_rate": 2.1137672260192004e-05, + "loss": 0.3849, + "step": 2460 + }, + { + "epoch": 1.66, + "learning_rate": 2.09644067606958e-05, + "loss": 0.3841, + "step": 2470 + }, + { + "epoch": 1.66, + "learning_rate": 
2.079134006523359e-05, + "loss": 0.3848, + "step": 2480 + }, + { + "epoch": 1.67, + "learning_rate": 2.061848069953017e-05, + "loss": 0.3817, + "step": 2490 + }, + { + "epoch": 1.68, + "learning_rate": 2.044583717909667e-05, + "loss": 0.3839, + "step": 2500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0273418008811125e-05, + "loss": 0.3749, + "step": 2510 + }, + { + "epoch": 1.69, + "learning_rate": 2.0101231682499506e-05, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 1.7, + "learning_rate": 1.9929286682517235e-05, + "loss": 0.3854, + "step": 2530 + }, + { + "epoch": 1.7, + "learning_rate": 1.9757591479331382e-05, + "loss": 0.3894, + "step": 2540 + }, + { + "epoch": 1.71, + "learning_rate": 1.9586154531103373e-05, + "loss": 0.3871, + "step": 2550 + }, + { + "epoch": 1.72, + "learning_rate": 1.9414984283272286e-05, + "loss": 0.382, + "step": 2560 + }, + { + "epoch": 1.72, + "learning_rate": 1.9244089168138836e-05, + "loss": 0.3858, + "step": 2570 + }, + { + "epoch": 1.73, + "learning_rate": 1.9073477604449985e-05, + "loss": 0.3819, + "step": 2580 + }, + { + "epoch": 1.74, + "learning_rate": 1.8903157996984174e-05, + "loss": 0.3795, + "step": 2590 + }, + { + "epoch": 1.74, + "learning_rate": 1.873313873613733e-05, + "loss": 0.3827, + "step": 2600 + }, + { + "epoch": 1.75, + "learning_rate": 1.8563428197509502e-05, + "loss": 0.385, + "step": 2610 + }, + { + "epoch": 1.76, + "learning_rate": 1.839403474149225e-05, + "loss": 0.3846, + "step": 2620 + }, + { + "epoch": 1.76, + "learning_rate": 1.8224966712856806e-05, + "loss": 0.3828, + "step": 2630 + }, + { + "epoch": 1.77, + "learning_rate": 1.8056232440343013e-05, + "loss": 0.3792, + "step": 2640 + }, + { + "epoch": 1.78, + "learning_rate": 1.788784023624896e-05, + "loss": 0.3814, + "step": 2650 + }, + { + "epoch": 1.78, + "learning_rate": 1.7719798396021558e-05, + "loss": 0.3819, + "step": 2660 + }, + { + "epoch": 1.79, + "learning_rate": 1.7552115197847884e-05, + "loss": 0.3798, + "step": 2670 + }, + { + "epoch": 1.8, + "learning_rate": 1.7384798902247316e-05, + "loss": 0.3772, + "step": 2680 + }, + { + "epoch": 1.8, + "learning_rate": 1.7217857751664663e-05, + "loss": 0.3758, + "step": 2690 + }, + { + "epoch": 1.81, + "learning_rate": 1.7051299970064098e-05, + "loss": 0.3706, + "step": 2700 + }, + { + "epoch": 1.82, + "learning_rate": 1.6885133762523985e-05, + "loss": 0.3805, + "step": 2710 + }, + { + "epoch": 1.82, + "learning_rate": 1.6719367314832756e-05, + "loss": 0.3892, + "step": 2720 + }, + { + "epoch": 1.83, + "learning_rate": 1.65540087930856e-05, + "loss": 0.387, + "step": 2730 + }, + { + "epoch": 1.84, + "learning_rate": 1.6389066343282168e-05, + "loss": 0.3773, + "step": 2740 + }, + { + "epoch": 1.84, + "learning_rate": 1.6224548090925323e-05, + "loss": 0.3829, + "step": 2750 + }, + { + "epoch": 1.85, + "learning_rate": 1.6060462140620835e-05, + "loss": 0.3697, + "step": 2760 + }, + { + "epoch": 1.86, + "learning_rate": 1.589681657567811e-05, + "loss": 0.3817, + "step": 2770 + }, + { + "epoch": 1.86, + "learning_rate": 1.5733619457712037e-05, + "loss": 0.3819, + "step": 2780 + }, + { + "epoch": 1.87, + "learning_rate": 1.5570878826245773e-05, + "loss": 0.3756, + "step": 2790 + }, + { + "epoch": 1.88, + "learning_rate": 1.5408602698314777e-05, + "loss": 0.3791, + "step": 2800 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 2.5472478497361363e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2800/training_args.bin 
b/checkpoint-2800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-2800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-3000/README.md b/checkpoint-3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-3000/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
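The fields below are left blank in this card. As a rough sketch of the estimate they feed into, here is the power × time × carbon-intensity arithmetic from Lacoste et al. (2019) with hypothetical placeholder numbers; only the GPU count is taken from this run (eight rng_state shards, presumably one per DDP rank):

```python
# Back-of-the-envelope CO2eq estimate in the style of Lacoste et al. (2019).
# Every figure is a hypothetical placeholder except num_gpus, which matches
# the eight rng_state_{0..7}.pth shards saved with these checkpoints.
gpu_power_kw = 0.3       # assumed average draw per GPU (~300 W)
num_gpus = 8             # presumably one DDP rank per GPU in this run
hours_used = 24.0        # placeholder for the "Hours used" field below
carbon_intensity = 0.4   # placeholder kg CO2eq per kWh for the compute region

energy_kwh = gpu_power_kw * num_gpus * hours_used  # 0.3 * 8 * 24 = 57.6 kWh
emissions_kg = energy_kwh * carbon_intensity       # 57.6 * 0.4 = 23.04 kg CO2eq
print(f"Estimated emissions: {emissions_kg:.2f} kg CO2eq")
```

Filling in the real hardware type, hours, provider, and region below would replace these placeholders and give the "Carbon Emitted" entry.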
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-3000/adapter_config.json b/checkpoint-3000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-3000/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-3000/adapter_model.bin b/checkpoint-3000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8705dc4ffce584ec7ddcd77e2952024ebcf4bdd6 --- /dev/null +++ b/checkpoint-3000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f075c3cf66c937bea653a061f69799d03da06be7c755807d75c0d85988531674 +size 7820185 diff --git a/checkpoint-3000/added_tokens.json b/checkpoint-3000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-3000/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a768bdaebe3dee670ea598fb621208d4f8404b3b --- /dev/null +++ b/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6219526ae8c2f83628c81f6b261ad7bbb94d787eceec49be2d6aa0cd5ed782 +size 15644485 diff --git a/checkpoint-3000/rng_state_0.pth b/checkpoint-3000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b0f36c16f0c42583811051d9934bc291315187a1 --- /dev/null +++ b/checkpoint-3000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9a6e3e84eae65b06cb302b4b4bead70446e2c876331412cfb1d2530b31ddadc +size 21687 diff --git a/checkpoint-3000/rng_state_1.pth b/checkpoint-3000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6eb69aa50354a59c8175f230aec6efb8f2093d49 --- /dev/null +++ b/checkpoint-3000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:05cfbb56746db4d454eff7d76b8968489e0858e87f01283b661c7b83cc37ff09 +size 21687 diff --git a/checkpoint-3000/rng_state_2.pth b/checkpoint-3000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..d00cab516230cfa18e979646de98cacb678737a8 --- /dev/null +++ b/checkpoint-3000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f9564d56a6177212a3beb81bc312ae9129a6789d210799027fc14bc49c70c11 +size 21687 diff --git a/checkpoint-3000/rng_state_3.pth b/checkpoint-3000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..31be840c0c62e87b0624d6c4a5f6f6d9fcd69d2c --- /dev/null +++ b/checkpoint-3000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aa5ab7e2c574a3f02d65dba593d2cca28960db978e4495b8ea61123b784d416 +size 21687 diff --git a/checkpoint-3000/rng_state_4.pth b/checkpoint-3000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..03eedf5b62d682fde6a530f4a1489f48c9041223 --- /dev/null +++ b/checkpoint-3000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de5532d77df314b556f587aa88b4efa5a8f30eab636b1faa0482b5dc970a2546 +size 21687 diff --git a/checkpoint-3000/rng_state_5.pth b/checkpoint-3000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..57fe296dfa719b2fa76feefb3a53836d000fc3ab --- /dev/null +++ b/checkpoint-3000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b6ea040e2eec938bf7308df5d3e7be10a2fabd0876a98f26a48d8cf907e71e +size 21687 diff --git a/checkpoint-3000/rng_state_6.pth b/checkpoint-3000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9c6dc28378f69670f526b379aa01753251641e6 --- /dev/null +++ b/checkpoint-3000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e94d2a33852ddc2a51964bfb321ea48fd6371cae86fe6ce9837a4c2631594f8 +size 21687 diff --git a/checkpoint-3000/rng_state_7.pth b/checkpoint-3000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ed6d0225c1f3d39fe0b8c7b957d533d5eb747a9 --- /dev/null +++ b/checkpoint-3000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9a89b698da44207ae890fbde4310b76fc24857ad583b592f8f4f21afe2652e9 +size 21687 diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3593f2042a5b825ee36fefed5be96793cb1ac1a --- /dev/null +++ b/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b04c89cf0b6dae312913cee288fb17c2ca8a0831aa7ef116befe372d99b5c75b +size 627 diff --git a/checkpoint-3000/special_tokens_map.json b/checkpoint-3000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-3000/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-3000/tokenization_chatglm.py b/checkpoint-3000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-3000/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece 
import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) to an id using the vocab.
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.0` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-3000/tokenizer.model b/checkpoint-3000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-3000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-3000/tokenizer_config.json b/checkpoint-3000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-3000/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0747749e699f255452baffea0c41a7054dbcaeab --- /dev/null +++ b/checkpoint-3000/trainer_state.json @@ -0,0 +1,1819 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.010386999497403, + "eval_steps": 500, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step":
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + }, + { + "epoch": 1.62, + "learning_rate": 2.2006682755823367e-05, + "loss": 0.3871, + "step": 2410 + }, + { + "epoch": 1.62, + "learning_rate": 2.1832551450078594e-05, + "loss": 0.3867, + "step": 2420 + }, + { + "epoch": 1.63, + "learning_rate": 2.165857618125625e-05, + "loss": 0.3878, + "step": 2430 + }, + { + "epoch": 1.64, + "learning_rate": 2.1484765519839843e-05, + "loss": 0.386, + "step": 2440 + }, + { + "epoch": 1.64, + "learning_rate": 2.1311128028203863e-05, + "loss": 0.3836, + "step": 2450 + }, + { + "epoch": 1.65, + "learning_rate": 2.1137672260192004e-05, + "loss": 0.3849, + "step": 2460 + }, + { + "epoch": 1.66, + "learning_rate": 2.09644067606958e-05, + "loss": 0.3841, + "step": 2470 + }, + { + "epoch": 1.66, + "learning_rate": 
2.079134006523359e-05, + "loss": 0.3848, + "step": 2480 + }, + { + "epoch": 1.67, + "learning_rate": 2.061848069953017e-05, + "loss": 0.3817, + "step": 2490 + }, + { + "epoch": 1.68, + "learning_rate": 2.044583717909667e-05, + "loss": 0.3839, + "step": 2500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0273418008811125e-05, + "loss": 0.3749, + "step": 2510 + }, + { + "epoch": 1.69, + "learning_rate": 2.0101231682499506e-05, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 1.7, + "learning_rate": 1.9929286682517235e-05, + "loss": 0.3854, + "step": 2530 + }, + { + "epoch": 1.7, + "learning_rate": 1.9757591479331382e-05, + "loss": 0.3894, + "step": 2540 + }, + { + "epoch": 1.71, + "learning_rate": 1.9586154531103373e-05, + "loss": 0.3871, + "step": 2550 + }, + { + "epoch": 1.72, + "learning_rate": 1.9414984283272286e-05, + "loss": 0.382, + "step": 2560 + }, + { + "epoch": 1.72, + "learning_rate": 1.9244089168138836e-05, + "loss": 0.3858, + "step": 2570 + }, + { + "epoch": 1.73, + "learning_rate": 1.9073477604449985e-05, + "loss": 0.3819, + "step": 2580 + }, + { + "epoch": 1.74, + "learning_rate": 1.8903157996984174e-05, + "loss": 0.3795, + "step": 2590 + }, + { + "epoch": 1.74, + "learning_rate": 1.873313873613733e-05, + "loss": 0.3827, + "step": 2600 + }, + { + "epoch": 1.75, + "learning_rate": 1.8563428197509502e-05, + "loss": 0.385, + "step": 2610 + }, + { + "epoch": 1.76, + "learning_rate": 1.839403474149225e-05, + "loss": 0.3846, + "step": 2620 + }, + { + "epoch": 1.76, + "learning_rate": 1.8224966712856806e-05, + "loss": 0.3828, + "step": 2630 + }, + { + "epoch": 1.77, + "learning_rate": 1.8056232440343013e-05, + "loss": 0.3792, + "step": 2640 + }, + { + "epoch": 1.78, + "learning_rate": 1.788784023624896e-05, + "loss": 0.3814, + "step": 2650 + }, + { + "epoch": 1.78, + "learning_rate": 1.7719798396021558e-05, + "loss": 0.3819, + "step": 2660 + }, + { + "epoch": 1.79, + "learning_rate": 1.7552115197847884e-05, + "loss": 0.3798, + "step": 2670 + }, + { + "epoch": 1.8, + "learning_rate": 1.7384798902247316e-05, + "loss": 0.3772, + "step": 2680 + }, + { + "epoch": 1.8, + "learning_rate": 1.7217857751664663e-05, + "loss": 0.3758, + "step": 2690 + }, + { + "epoch": 1.81, + "learning_rate": 1.7051299970064098e-05, + "loss": 0.3706, + "step": 2700 + }, + { + "epoch": 1.82, + "learning_rate": 1.6885133762523985e-05, + "loss": 0.3805, + "step": 2710 + }, + { + "epoch": 1.82, + "learning_rate": 1.6719367314832756e-05, + "loss": 0.3892, + "step": 2720 + }, + { + "epoch": 1.83, + "learning_rate": 1.65540087930856e-05, + "loss": 0.387, + "step": 2730 + }, + { + "epoch": 1.84, + "learning_rate": 1.6389066343282168e-05, + "loss": 0.3773, + "step": 2740 + }, + { + "epoch": 1.84, + "learning_rate": 1.6224548090925323e-05, + "loss": 0.3829, + "step": 2750 + }, + { + "epoch": 1.85, + "learning_rate": 1.6060462140620835e-05, + "loss": 0.3697, + "step": 2760 + }, + { + "epoch": 1.86, + "learning_rate": 1.589681657567811e-05, + "loss": 0.3817, + "step": 2770 + }, + { + "epoch": 1.86, + "learning_rate": 1.5733619457712037e-05, + "loss": 0.3819, + "step": 2780 + }, + { + "epoch": 1.87, + "learning_rate": 1.5570878826245773e-05, + "loss": 0.3756, + "step": 2790 + }, + { + "epoch": 1.88, + "learning_rate": 1.5408602698314777e-05, + "loss": 0.3791, + "step": 2800 + }, + { + "epoch": 1.88, + "learning_rate": 1.5246799068071818e-05, + "loss": 0.3765, + "step": 2810 + }, + { + "epoch": 1.89, + "learning_rate": 1.5085475906393153e-05, + "loss": 0.3834, + "step": 2820 + }, + { + "epoch": 1.9, + "learning_rate": 
1.4924641160485923e-05, + "loss": 0.381, + "step": 2830 + }, + { + "epoch": 1.9, + "learning_rate": 1.4764302753496584e-05, + "loss": 0.3766, + "step": 2840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4604468584120607e-05, + "loss": 0.3815, + "step": 2850 + }, + { + "epoch": 1.92, + "learning_rate": 1.4445146526213415e-05, + "loss": 0.3774, + "step": 2860 + }, + { + "epoch": 1.92, + "learning_rate": 1.4286344428402454e-05, + "loss": 0.3879, + "step": 2870 + }, + { + "epoch": 1.93, + "learning_rate": 1.412807011370052e-05, + "loss": 0.3777, + "step": 2880 + }, + { + "epoch": 1.94, + "learning_rate": 1.3970331379120455e-05, + "loss": 0.3806, + "step": 2890 + }, + { + "epoch": 1.94, + "learning_rate": 1.3813135995290988e-05, + "loss": 0.3848, + "step": 2900 + }, + { + "epoch": 1.95, + "learning_rate": 1.3656491706073935e-05, + "loss": 0.3745, + "step": 2910 + }, + { + "epoch": 1.96, + "learning_rate": 1.350040622818275e-05, + "loss": 0.377, + "step": 2920 + }, + { + "epoch": 1.96, + "learning_rate": 1.3344887250802345e-05, + "loss": 0.3783, + "step": 2930 + }, + { + "epoch": 1.97, + "learning_rate": 1.3189942435210301e-05, + "loss": 0.3768, + "step": 2940 + }, + { + "epoch": 1.98, + "learning_rate": 1.303557941439949e-05, + "loss": 0.3744, + "step": 2950 + }, + { + "epoch": 1.98, + "learning_rate": 1.2881805792702031e-05, + "loss": 0.3788, + "step": 2960 + }, + { + "epoch": 1.99, + "learning_rate": 1.2728629145414645e-05, + "loss": 0.3735, + "step": 2970 + }, + { + "epoch": 2.0, + "learning_rate": 1.257605701842554e-05, + "loss": 0.3819, + "step": 2980 + }, + { + "epoch": 2.0, + "learning_rate": 1.242409692784265e-05, + "loss": 0.3812, + "step": 2990 + }, + { + "epoch": 2.01, + "learning_rate": 1.2272756359623342e-05, + "loss": 0.3769, + "step": 3000 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 2.7293094148925555e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-3200/README.md b/checkpoint-3200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-3200/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + 
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-3200/adapter_config.json b/checkpoint-3200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-3200/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-3200/adapter_model.bin b/checkpoint-3200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ece56d85cdf750bd0a4e901460219b51afa19f16 --- /dev/null +++ b/checkpoint-3200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:163f3ff057e98a082d2e44d040cbf764b6bcdfe9fbf65300648aa93df6d6679a +size 7820185 diff --git a/checkpoint-3200/added_tokens.json b/checkpoint-3200/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ 
b/checkpoint-3200/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-3200/optimizer.pt b/checkpoint-3200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ba1f790253d0472739d7e556bec214592e985c9 --- /dev/null +++ b/checkpoint-3200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15cc265b1aa557b2b1827130d246e3d6ea45c95dda0bad0284f0ee9f3e2d1e10 +size 15644485 diff --git a/checkpoint-3200/rng_state_0.pth b/checkpoint-3200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b3aad839618efa10c7d668a44f1e1162f987b161 --- /dev/null +++ b/checkpoint-3200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6064b6293a80aa72a80c51edb36c012708955089e2f19744c930ed2494325d1c +size 21687 diff --git a/checkpoint-3200/rng_state_1.pth b/checkpoint-3200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..deb43352a1b2e935c6161b02daab08189eff2566 --- /dev/null +++ b/checkpoint-3200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe1b61eb0f80f1a01d61503160c34ea82002e277e1db7dc4576d32728150faa +size 21687 diff --git a/checkpoint-3200/rng_state_2.pth b/checkpoint-3200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..532c9703f211a9755e3e0a8d20075a18491faf03 --- /dev/null +++ b/checkpoint-3200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2a1efb3ca8e4b8d6ed0652de1ee4f578d99b488fea868cba61b9d9f74edb4b3 +size 21687 diff --git a/checkpoint-3200/rng_state_3.pth b/checkpoint-3200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3d1c7529fda028e9400f3da0605a86f5f7f8846 --- /dev/null +++ b/checkpoint-3200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a81daaabeffc8bd75cfa62d6bb13ca132fb9e695a49ad0f7f4802407f8b2432c +size 21687 diff --git a/checkpoint-3200/rng_state_4.pth b/checkpoint-3200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e2a05e93f8965d202937ac2cd7e2d36d5db51a3 --- /dev/null +++ b/checkpoint-3200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:564c92a1ce6cbcdd21948d3253179e061a98fe6b7561fe242db6e9c3ff0160cb +size 21687 diff --git a/checkpoint-3200/rng_state_5.pth b/checkpoint-3200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..7b962466335c3925b3fe71e823c6d4546ac73996 --- /dev/null +++ b/checkpoint-3200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:671d151bdfb9aa8af88a2dcd96d10cd4537431cf3fe850d0b710f1de9c900453 +size 21687 diff --git a/checkpoint-3200/rng_state_6.pth b/checkpoint-3200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e354ccbea9694355e447095d4e2dd1d5298baf5 --- /dev/null +++ b/checkpoint-3200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39eb56add759872b315cb0a3b13c7d19429f9ce2327503dd764dfd888ea6a4bf +size 21687 diff --git a/checkpoint-3200/rng_state_7.pth b/checkpoint-3200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..cff3099578a5356b0f8f51685b8af6744711c6f1 --- /dev/null +++ b/checkpoint-3200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ffb607afae00c525b19fecdedef8aa65b3b73ec7687fceb4dd528f33cd4ece78 +size 21687 diff --git a/checkpoint-3200/scheduler.pt b/checkpoint-3200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..83eb670f065bb39826c7f754786e8349d5cc48e1 --- /dev/null +++ b/checkpoint-3200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61bc04db55b75f6c1384c4839161ed2a2b4150193627923ee618cb46c20fb5d1 +size 627 diff --git a/checkpoint-3200/special_tokens_map.json b/checkpoint-3200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-3200/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-3200/tokenization_chatglm.py b/checkpoint-3200/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-3200/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-3200/tokenizer.model b/checkpoint-3200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-3200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-3200/tokenizer_config.json b/checkpoint-3200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-3200/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-3200/trainer_state.json b/checkpoint-3200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0ebd6c21512b0c2b66e93958c9fcd12ef9852c22 --- /dev/null +++ b/checkpoint-3200/trainer_state.json @@ -0,0 +1,1939 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.144412799463897, + "eval_steps": 500, + "global_step": 3200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + }, + { + "epoch": 1.62, + "learning_rate": 2.2006682755823367e-05, + "loss": 0.3871, + "step": 2410 + }, + { + "epoch": 1.62, + "learning_rate": 2.1832551450078594e-05, + "loss": 0.3867, + "step": 2420 + }, + { + "epoch": 1.63, + "learning_rate": 2.165857618125625e-05, + "loss": 0.3878, + "step": 2430 + }, + { + "epoch": 1.64, + "learning_rate": 2.1484765519839843e-05, + "loss": 0.386, + "step": 2440 + }, + { + "epoch": 1.64, + "learning_rate": 2.1311128028203863e-05, + "loss": 0.3836, + "step": 2450 + }, + { + "epoch": 1.65, + "learning_rate": 2.1137672260192004e-05, + "loss": 0.3849, + "step": 2460 + }, + { + "epoch": 1.66, + "learning_rate": 2.09644067606958e-05, + "loss": 0.3841, + "step": 2470 + }, + { + "epoch": 1.66, + "learning_rate": 
2.079134006523359e-05, + "loss": 0.3848, + "step": 2480 + }, + { + "epoch": 1.67, + "learning_rate": 2.061848069953017e-05, + "loss": 0.3817, + "step": 2490 + }, + { + "epoch": 1.68, + "learning_rate": 2.044583717909667e-05, + "loss": 0.3839, + "step": 2500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0273418008811125e-05, + "loss": 0.3749, + "step": 2510 + }, + { + "epoch": 1.69, + "learning_rate": 2.0101231682499506e-05, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 1.7, + "learning_rate": 1.9929286682517235e-05, + "loss": 0.3854, + "step": 2530 + }, + { + "epoch": 1.7, + "learning_rate": 1.9757591479331382e-05, + "loss": 0.3894, + "step": 2540 + }, + { + "epoch": 1.71, + "learning_rate": 1.9586154531103373e-05, + "loss": 0.3871, + "step": 2550 + }, + { + "epoch": 1.72, + "learning_rate": 1.9414984283272286e-05, + "loss": 0.382, + "step": 2560 + }, + { + "epoch": 1.72, + "learning_rate": 1.9244089168138836e-05, + "loss": 0.3858, + "step": 2570 + }, + { + "epoch": 1.73, + "learning_rate": 1.9073477604449985e-05, + "loss": 0.3819, + "step": 2580 + }, + { + "epoch": 1.74, + "learning_rate": 1.8903157996984174e-05, + "loss": 0.3795, + "step": 2590 + }, + { + "epoch": 1.74, + "learning_rate": 1.873313873613733e-05, + "loss": 0.3827, + "step": 2600 + }, + { + "epoch": 1.75, + "learning_rate": 1.8563428197509502e-05, + "loss": 0.385, + "step": 2610 + }, + { + "epoch": 1.76, + "learning_rate": 1.839403474149225e-05, + "loss": 0.3846, + "step": 2620 + }, + { + "epoch": 1.76, + "learning_rate": 1.8224966712856806e-05, + "loss": 0.3828, + "step": 2630 + }, + { + "epoch": 1.77, + "learning_rate": 1.8056232440343013e-05, + "loss": 0.3792, + "step": 2640 + }, + { + "epoch": 1.78, + "learning_rate": 1.788784023624896e-05, + "loss": 0.3814, + "step": 2650 + }, + { + "epoch": 1.78, + "learning_rate": 1.7719798396021558e-05, + "loss": 0.3819, + "step": 2660 + }, + { + "epoch": 1.79, + "learning_rate": 1.7552115197847884e-05, + "loss": 0.3798, + "step": 2670 + }, + { + "epoch": 1.8, + "learning_rate": 1.7384798902247316e-05, + "loss": 0.3772, + "step": 2680 + }, + { + "epoch": 1.8, + "learning_rate": 1.7217857751664663e-05, + "loss": 0.3758, + "step": 2690 + }, + { + "epoch": 1.81, + "learning_rate": 1.7051299970064098e-05, + "loss": 0.3706, + "step": 2700 + }, + { + "epoch": 1.82, + "learning_rate": 1.6885133762523985e-05, + "loss": 0.3805, + "step": 2710 + }, + { + "epoch": 1.82, + "learning_rate": 1.6719367314832756e-05, + "loss": 0.3892, + "step": 2720 + }, + { + "epoch": 1.83, + "learning_rate": 1.65540087930856e-05, + "loss": 0.387, + "step": 2730 + }, + { + "epoch": 1.84, + "learning_rate": 1.6389066343282168e-05, + "loss": 0.3773, + "step": 2740 + }, + { + "epoch": 1.84, + "learning_rate": 1.6224548090925323e-05, + "loss": 0.3829, + "step": 2750 + }, + { + "epoch": 1.85, + "learning_rate": 1.6060462140620835e-05, + "loss": 0.3697, + "step": 2760 + }, + { + "epoch": 1.86, + "learning_rate": 1.589681657567811e-05, + "loss": 0.3817, + "step": 2770 + }, + { + "epoch": 1.86, + "learning_rate": 1.5733619457712037e-05, + "loss": 0.3819, + "step": 2780 + }, + { + "epoch": 1.87, + "learning_rate": 1.5570878826245773e-05, + "loss": 0.3756, + "step": 2790 + }, + { + "epoch": 1.88, + "learning_rate": 1.5408602698314777e-05, + "loss": 0.3791, + "step": 2800 + }, + { + "epoch": 1.88, + "learning_rate": 1.5246799068071818e-05, + "loss": 0.3765, + "step": 2810 + }, + { + "epoch": 1.89, + "learning_rate": 1.5085475906393153e-05, + "loss": 0.3834, + "step": 2820 + }, + { + "epoch": 1.9, + "learning_rate": 
1.4924641160485923e-05, + "loss": 0.381, + "step": 2830 + }, + { + "epoch": 1.9, + "learning_rate": 1.4764302753496584e-05, + "loss": 0.3766, + "step": 2840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4604468584120607e-05, + "loss": 0.3815, + "step": 2850 + }, + { + "epoch": 1.92, + "learning_rate": 1.4445146526213415e-05, + "loss": 0.3774, + "step": 2860 + }, + { + "epoch": 1.92, + "learning_rate": 1.4286344428402454e-05, + "loss": 0.3879, + "step": 2870 + }, + { + "epoch": 1.93, + "learning_rate": 1.412807011370052e-05, + "loss": 0.3777, + "step": 2880 + }, + { + "epoch": 1.94, + "learning_rate": 1.3970331379120455e-05, + "loss": 0.3806, + "step": 2890 + }, + { + "epoch": 1.94, + "learning_rate": 1.3813135995290988e-05, + "loss": 0.3848, + "step": 2900 + }, + { + "epoch": 1.95, + "learning_rate": 1.3656491706073935e-05, + "loss": 0.3745, + "step": 2910 + }, + { + "epoch": 1.96, + "learning_rate": 1.350040622818275e-05, + "loss": 0.377, + "step": 2920 + }, + { + "epoch": 1.96, + "learning_rate": 1.3344887250802345e-05, + "loss": 0.3783, + "step": 2930 + }, + { + "epoch": 1.97, + "learning_rate": 1.3189942435210301e-05, + "loss": 0.3768, + "step": 2940 + }, + { + "epoch": 1.98, + "learning_rate": 1.303557941439949e-05, + "loss": 0.3744, + "step": 2950 + }, + { + "epoch": 1.98, + "learning_rate": 1.2881805792702031e-05, + "loss": 0.3788, + "step": 2960 + }, + { + "epoch": 1.99, + "learning_rate": 1.2728629145414645e-05, + "loss": 0.3735, + "step": 2970 + }, + { + "epoch": 2.0, + "learning_rate": 1.257605701842554e-05, + "loss": 0.3819, + "step": 2980 + }, + { + "epoch": 2.0, + "learning_rate": 1.242409692784265e-05, + "loss": 0.3812, + "step": 2990 + }, + { + "epoch": 2.01, + "learning_rate": 1.2272756359623342e-05, + "loss": 0.3769, + "step": 3000 + }, + { + "epoch": 2.02, + "learning_rate": 1.2122042769205702e-05, + "loss": 0.3779, + "step": 3010 + }, + { + "epoch": 2.02, + "learning_rate": 1.1971963581141196e-05, + "loss": 0.3817, + "step": 3020 + }, + { + "epoch": 2.03, + "learning_rate": 1.1822526188728966e-05, + "loss": 0.3788, + "step": 3030 + }, + { + "epoch": 2.04, + "learning_rate": 1.1673737953651601e-05, + "loss": 0.3776, + "step": 3040 + }, + { + "epoch": 2.04, + "learning_rate": 1.1525606205612447e-05, + "loss": 0.3785, + "step": 3050 + }, + { + "epoch": 2.05, + "learning_rate": 1.1378138241974595e-05, + "loss": 0.3858, + "step": 3060 + }, + { + "epoch": 2.06, + "learning_rate": 1.1231341327401323e-05, + "loss": 0.3766, + "step": 3070 + }, + { + "epoch": 2.06, + "learning_rate": 1.1085222693498256e-05, + "loss": 0.3766, + "step": 3080 + }, + { + "epoch": 2.07, + "learning_rate": 1.093978953845713e-05, + "loss": 0.3795, + "step": 3090 + }, + { + "epoch": 2.08, + "learning_rate": 1.079504902670117e-05, + "loss": 0.3837, + "step": 3100 + }, + { + "epoch": 2.08, + "learning_rate": 1.065100828853213e-05, + "loss": 0.377, + "step": 3110 + }, + { + "epoch": 2.09, + "learning_rate": 1.0507674419779085e-05, + "loss": 0.3759, + "step": 3120 + }, + { + "epoch": 2.1, + "learning_rate": 1.0365054481448849e-05, + "loss": 0.3704, + "step": 3130 + }, + { + "epoch": 2.1, + "learning_rate": 1.02231554993781e-05, + "loss": 0.3751, + "step": 3140 + }, + { + "epoch": 2.11, + "learning_rate": 1.0081984463887325e-05, + "loss": 0.396, + "step": 3150 + }, + { + "epoch": 2.12, + "learning_rate": 9.941548329436425e-06, + "loss": 0.3788, + "step": 3160 + }, + { + "epoch": 2.12, + "learning_rate": 9.801854014282108e-06, + "loss": 0.3767, + "step": 3170 + }, + { + "epoch": 2.13, + "learning_rate": 
9.662908400137125e-06, + "loss": 0.3783, + "step": 3180 + }, + { + "epoch": 2.14, + "learning_rate": 9.524718331831186e-06, + "loss": 0.3775, + "step": 3190 + }, + { + "epoch": 2.14, + "learning_rate": 9.387290616973859e-06, + "loss": 0.3789, + "step": 3200 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 2.911200383088984e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3200/training_args.bin b/checkpoint-3200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-3200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-3400/README.md b/checkpoint-3400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-3400/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-3400/adapter_config.json b/checkpoint-3400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-3400/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-3400/adapter_model.bin b/checkpoint-3400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ab33d5464bed306695c29bbeaec6a2f719fe69c0 --- /dev/null +++ b/checkpoint-3400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ddaef6235651477f69bda1e6a2f92624c31833b37506e199311936d2c7eb47e +size 7820185 diff --git a/checkpoint-3400/added_tokens.json b/checkpoint-3400/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-3400/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-3400/optimizer.pt b/checkpoint-3400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0c29eb1fb76775e3baf193415a7eadc6a10eaea --- /dev/null +++ b/checkpoint-3400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45a298f685c1969294337de9bf0dff672bf3f686782cbf8f187cc9224da6bd76 +size 15644485 diff --git a/checkpoint-3400/rng_state_0.pth b/checkpoint-3400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..46fb96a30c8ec22479c1e67bae0a047b69f43ca2 --- /dev/null +++ b/checkpoint-3400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c97e5b3cfe044ec8cf781218f21f041e93cb36de76ec14dfba4eadae031dd02 +size 21687 diff --git a/checkpoint-3400/rng_state_1.pth b/checkpoint-3400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9dd391461d18beb59817840b1d9b637199cc208d --- /dev/null +++ b/checkpoint-3400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:87301756c0c70c6035c27f3aeec13f83a8f6537109a378a7700ee46c3abec8f6 +size 21687 diff --git a/checkpoint-3400/rng_state_2.pth b/checkpoint-3400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bab98ea40da4ad1e66c725ae93c0bd72ef23b246 --- /dev/null +++ b/checkpoint-3400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7e55c19d39bb0c7db0a4f4cb79aba56e5f223362c74c43efc911565f600280b +size 21687 diff --git a/checkpoint-3400/rng_state_3.pth b/checkpoint-3400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9bd02b2bf119533e1a7743e09e8cc585d1b0ed8 --- /dev/null +++ b/checkpoint-3400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7b575ccb5d521850d91b64cba7a605dc09913953efb6e549e77513b5fdc7f94 +size 21687 diff --git a/checkpoint-3400/rng_state_4.pth b/checkpoint-3400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c6beb0d3456648029a84f7d3e2fb180253971a5b --- /dev/null +++ b/checkpoint-3400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b0cd60309874203ed1f388b7a6baaff4e1e9608ab56f1d21a503c018a36b273 +size 21687 diff --git a/checkpoint-3400/rng_state_5.pth b/checkpoint-3400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..9765f0cbb6ac5bf14d15e299df57b91583cd522e --- /dev/null +++ b/checkpoint-3400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b17af76c77fc824a6b64fcbdd8ad652d6ffdcd41b7ff919d3bc4443adae1b7 +size 21687 diff --git a/checkpoint-3400/rng_state_6.pth b/checkpoint-3400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..24996a4a89053b787420cb49990450b5547444c5 --- /dev/null +++ b/checkpoint-3400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e42cdf67600ffe9e443cc4d02228b2fb693d393e0b8b7a55b91122b2acb43aa +size 21687 diff --git a/checkpoint-3400/rng_state_7.pth b/checkpoint-3400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..b33926f2733bb418de03d4b211d59ea62b2a8655 --- /dev/null +++ b/checkpoint-3400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d59ac2fa1e10ceecb5ef6b9a1849669333c8ef926fd94f8fe9822316a33505ea +size 21687 diff --git a/checkpoint-3400/scheduler.pt b/checkpoint-3400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b976275dbc8858c5c10c88cbe477771db45177ea --- /dev/null +++ b/checkpoint-3400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba3d88e05c1919ee4fe1372db3cba741f4fd05561961e312b1c4a9bb472abc9 +size 627 diff --git a/checkpoint-3400/special_tokens_map.json b/checkpoint-3400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-3400/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-3400/tokenization_chatglm.py b/checkpoint-3400/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-3400/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece 
import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-3400/tokenizer.model b/checkpoint-3400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-3400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-3400/tokenizer_config.json b/checkpoint-3400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-3400/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-3400/trainer_state.json b/checkpoint-3400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2743f25a82748996a30baf9ff8a8a0cf5bc34148 --- /dev/null +++ b/checkpoint-3400/trainer_state.json @@ -0,0 +1,2059 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.27843859943039, + "eval_steps": 500, + "global_step": 3400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 10 
+ }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + }, + { + "epoch": 1.62, + "learning_rate": 2.2006682755823367e-05, + "loss": 0.3871, + "step": 2410 + }, + { + "epoch": 1.62, + "learning_rate": 2.1832551450078594e-05, + "loss": 0.3867, + "step": 2420 + }, + { + "epoch": 1.63, + "learning_rate": 2.165857618125625e-05, + "loss": 0.3878, + "step": 2430 + }, + { + "epoch": 1.64, + "learning_rate": 2.1484765519839843e-05, + "loss": 0.386, + "step": 2440 + }, + { + "epoch": 1.64, + "learning_rate": 2.1311128028203863e-05, + "loss": 0.3836, + "step": 2450 + }, + { + "epoch": 1.65, + "learning_rate": 2.1137672260192004e-05, + "loss": 0.3849, + "step": 2460 + }, + { + "epoch": 1.66, + "learning_rate": 2.09644067606958e-05, + "loss": 0.3841, + "step": 2470 + }, + { + "epoch": 1.66, + "learning_rate": 
2.079134006523359e-05, + "loss": 0.3848, + "step": 2480 + }, + { + "epoch": 1.67, + "learning_rate": 2.061848069953017e-05, + "loss": 0.3817, + "step": 2490 + }, + { + "epoch": 1.68, + "learning_rate": 2.044583717909667e-05, + "loss": 0.3839, + "step": 2500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0273418008811125e-05, + "loss": 0.3749, + "step": 2510 + }, + { + "epoch": 1.69, + "learning_rate": 2.0101231682499506e-05, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 1.7, + "learning_rate": 1.9929286682517235e-05, + "loss": 0.3854, + "step": 2530 + }, + { + "epoch": 1.7, + "learning_rate": 1.9757591479331382e-05, + "loss": 0.3894, + "step": 2540 + }, + { + "epoch": 1.71, + "learning_rate": 1.9586154531103373e-05, + "loss": 0.3871, + "step": 2550 + }, + { + "epoch": 1.72, + "learning_rate": 1.9414984283272286e-05, + "loss": 0.382, + "step": 2560 + }, + { + "epoch": 1.72, + "learning_rate": 1.9244089168138836e-05, + "loss": 0.3858, + "step": 2570 + }, + { + "epoch": 1.73, + "learning_rate": 1.9073477604449985e-05, + "loss": 0.3819, + "step": 2580 + }, + { + "epoch": 1.74, + "learning_rate": 1.8903157996984174e-05, + "loss": 0.3795, + "step": 2590 + }, + { + "epoch": 1.74, + "learning_rate": 1.873313873613733e-05, + "loss": 0.3827, + "step": 2600 + }, + { + "epoch": 1.75, + "learning_rate": 1.8563428197509502e-05, + "loss": 0.385, + "step": 2610 + }, + { + "epoch": 1.76, + "learning_rate": 1.839403474149225e-05, + "loss": 0.3846, + "step": 2620 + }, + { + "epoch": 1.76, + "learning_rate": 1.8224966712856806e-05, + "loss": 0.3828, + "step": 2630 + }, + { + "epoch": 1.77, + "learning_rate": 1.8056232440343013e-05, + "loss": 0.3792, + "step": 2640 + }, + { + "epoch": 1.78, + "learning_rate": 1.788784023624896e-05, + "loss": 0.3814, + "step": 2650 + }, + { + "epoch": 1.78, + "learning_rate": 1.7719798396021558e-05, + "loss": 0.3819, + "step": 2660 + }, + { + "epoch": 1.79, + "learning_rate": 1.7552115197847884e-05, + "loss": 0.3798, + "step": 2670 + }, + { + "epoch": 1.8, + "learning_rate": 1.7384798902247316e-05, + "loss": 0.3772, + "step": 2680 + }, + { + "epoch": 1.8, + "learning_rate": 1.7217857751664663e-05, + "loss": 0.3758, + "step": 2690 + }, + { + "epoch": 1.81, + "learning_rate": 1.7051299970064098e-05, + "loss": 0.3706, + "step": 2700 + }, + { + "epoch": 1.82, + "learning_rate": 1.6885133762523985e-05, + "loss": 0.3805, + "step": 2710 + }, + { + "epoch": 1.82, + "learning_rate": 1.6719367314832756e-05, + "loss": 0.3892, + "step": 2720 + }, + { + "epoch": 1.83, + "learning_rate": 1.65540087930856e-05, + "loss": 0.387, + "step": 2730 + }, + { + "epoch": 1.84, + "learning_rate": 1.6389066343282168e-05, + "loss": 0.3773, + "step": 2740 + }, + { + "epoch": 1.84, + "learning_rate": 1.6224548090925323e-05, + "loss": 0.3829, + "step": 2750 + }, + { + "epoch": 1.85, + "learning_rate": 1.6060462140620835e-05, + "loss": 0.3697, + "step": 2760 + }, + { + "epoch": 1.86, + "learning_rate": 1.589681657567811e-05, + "loss": 0.3817, + "step": 2770 + }, + { + "epoch": 1.86, + "learning_rate": 1.5733619457712037e-05, + "loss": 0.3819, + "step": 2780 + }, + { + "epoch": 1.87, + "learning_rate": 1.5570878826245773e-05, + "loss": 0.3756, + "step": 2790 + }, + { + "epoch": 1.88, + "learning_rate": 1.5408602698314777e-05, + "loss": 0.3791, + "step": 2800 + }, + { + "epoch": 1.88, + "learning_rate": 1.5246799068071818e-05, + "loss": 0.3765, + "step": 2810 + }, + { + "epoch": 1.89, + "learning_rate": 1.5085475906393153e-05, + "loss": 0.3834, + "step": 2820 + }, + { + "epoch": 1.9, + "learning_rate": 
1.4924641160485923e-05, + "loss": 0.381, + "step": 2830 + }, + { + "epoch": 1.9, + "learning_rate": 1.4764302753496584e-05, + "loss": 0.3766, + "step": 2840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4604468584120607e-05, + "loss": 0.3815, + "step": 2850 + }, + { + "epoch": 1.92, + "learning_rate": 1.4445146526213415e-05, + "loss": 0.3774, + "step": 2860 + }, + { + "epoch": 1.92, + "learning_rate": 1.4286344428402454e-05, + "loss": 0.3879, + "step": 2870 + }, + { + "epoch": 1.93, + "learning_rate": 1.412807011370052e-05, + "loss": 0.3777, + "step": 2880 + }, + { + "epoch": 1.94, + "learning_rate": 1.3970331379120455e-05, + "loss": 0.3806, + "step": 2890 + }, + { + "epoch": 1.94, + "learning_rate": 1.3813135995290988e-05, + "loss": 0.3848, + "step": 2900 + }, + { + "epoch": 1.95, + "learning_rate": 1.3656491706073935e-05, + "loss": 0.3745, + "step": 2910 + }, + { + "epoch": 1.96, + "learning_rate": 1.350040622818275e-05, + "loss": 0.377, + "step": 2920 + }, + { + "epoch": 1.96, + "learning_rate": 1.3344887250802345e-05, + "loss": 0.3783, + "step": 2930 + }, + { + "epoch": 1.97, + "learning_rate": 1.3189942435210301e-05, + "loss": 0.3768, + "step": 2940 + }, + { + "epoch": 1.98, + "learning_rate": 1.303557941439949e-05, + "loss": 0.3744, + "step": 2950 + }, + { + "epoch": 1.98, + "learning_rate": 1.2881805792702031e-05, + "loss": 0.3788, + "step": 2960 + }, + { + "epoch": 1.99, + "learning_rate": 1.2728629145414645e-05, + "loss": 0.3735, + "step": 2970 + }, + { + "epoch": 2.0, + "learning_rate": 1.257605701842554e-05, + "loss": 0.3819, + "step": 2980 + }, + { + "epoch": 2.0, + "learning_rate": 1.242409692784265e-05, + "loss": 0.3812, + "step": 2990 + }, + { + "epoch": 2.01, + "learning_rate": 1.2272756359623342e-05, + "loss": 0.3769, + "step": 3000 + }, + { + "epoch": 2.02, + "learning_rate": 1.2122042769205702e-05, + "loss": 0.3779, + "step": 3010 + }, + { + "epoch": 2.02, + "learning_rate": 1.1971963581141196e-05, + "loss": 0.3817, + "step": 3020 + }, + { + "epoch": 2.03, + "learning_rate": 1.1822526188728966e-05, + "loss": 0.3788, + "step": 3030 + }, + { + "epoch": 2.04, + "learning_rate": 1.1673737953651601e-05, + "loss": 0.3776, + "step": 3040 + }, + { + "epoch": 2.04, + "learning_rate": 1.1525606205612447e-05, + "loss": 0.3785, + "step": 3050 + }, + { + "epoch": 2.05, + "learning_rate": 1.1378138241974595e-05, + "loss": 0.3858, + "step": 3060 + }, + { + "epoch": 2.06, + "learning_rate": 1.1231341327401323e-05, + "loss": 0.3766, + "step": 3070 + }, + { + "epoch": 2.06, + "learning_rate": 1.1085222693498256e-05, + "loss": 0.3766, + "step": 3080 + }, + { + "epoch": 2.07, + "learning_rate": 1.093978953845713e-05, + "loss": 0.3795, + "step": 3090 + }, + { + "epoch": 2.08, + "learning_rate": 1.079504902670117e-05, + "loss": 0.3837, + "step": 3100 + }, + { + "epoch": 2.08, + "learning_rate": 1.065100828853213e-05, + "loss": 0.377, + "step": 3110 + }, + { + "epoch": 2.09, + "learning_rate": 1.0507674419779085e-05, + "loss": 0.3759, + "step": 3120 + }, + { + "epoch": 2.1, + "learning_rate": 1.0365054481448849e-05, + "loss": 0.3704, + "step": 3130 + }, + { + "epoch": 2.1, + "learning_rate": 1.02231554993781e-05, + "loss": 0.3751, + "step": 3140 + }, + { + "epoch": 2.11, + "learning_rate": 1.0081984463887325e-05, + "loss": 0.396, + "step": 3150 + }, + { + "epoch": 2.12, + "learning_rate": 9.941548329436425e-06, + "loss": 0.3788, + "step": 3160 + }, + { + "epoch": 2.12, + "learning_rate": 9.801854014282108e-06, + "loss": 0.3767, + "step": 3170 + }, + { + "epoch": 2.13, + "learning_rate": 
9.662908400137125e-06, + "loss": 0.3783, + "step": 3180 + }, + { + "epoch": 2.14, + "learning_rate": 9.524718331831186e-06, + "loss": 0.3775, + "step": 3190 + }, + { + "epoch": 2.14, + "learning_rate": 9.387290616973859e-06, + "loss": 0.3789, + "step": 3200 + }, + { + "epoch": 2.15, + "learning_rate": 9.250632025619104e-06, + "loss": 0.3776, + "step": 3210 + }, + { + "epoch": 2.16, + "learning_rate": 9.11474928993187e-06, + "loss": 0.368, + "step": 3220 + }, + { + "epoch": 2.16, + "learning_rate": 8.979649103856345e-06, + "loss": 0.378, + "step": 3230 + }, + { + "epoch": 2.17, + "learning_rate": 8.84533812278629e-06, + "loss": 0.3776, + "step": 3240 + }, + { + "epoch": 2.18, + "learning_rate": 8.711822963237093e-06, + "loss": 0.3731, + "step": 3250 + }, + { + "epoch": 2.18, + "learning_rate": 8.579110202519894e-06, + "loss": 0.3827, + "step": 3260 + }, + { + "epoch": 2.19, + "learning_rate": 8.447206378417533e-06, + "loss": 0.3725, + "step": 3270 + }, + { + "epoch": 2.2, + "learning_rate": 8.31611798886246e-06, + "loss": 0.372, + "step": 3280 + }, + { + "epoch": 2.2, + "learning_rate": 8.185851491616677e-06, + "loss": 0.3753, + "step": 3290 + }, + { + "epoch": 2.21, + "learning_rate": 8.0564133039536e-06, + "loss": 0.3732, + "step": 3300 + }, + { + "epoch": 2.22, + "learning_rate": 7.927809802341876e-06, + "loss": 0.37, + "step": 3310 + }, + { + "epoch": 2.22, + "learning_rate": 7.800047322131346e-06, + "loss": 0.372, + "step": 3320 + }, + { + "epoch": 2.23, + "learning_rate": 7.673132157240877e-06, + "loss": 0.3734, + "step": 3330 + }, + { + "epoch": 2.24, + "learning_rate": 7.5470705598483405e-06, + "loss": 0.3734, + "step": 3340 + }, + { + "epoch": 2.24, + "learning_rate": 7.4218687400826075e-06, + "loss": 0.3784, + "step": 3350 + }, + { + "epoch": 2.25, + "learning_rate": 7.297532865717638e-06, + "loss": 0.3715, + "step": 3360 + }, + { + "epoch": 2.26, + "learning_rate": 7.174069061868591e-06, + "loss": 0.3836, + "step": 3370 + }, + { + "epoch": 2.27, + "learning_rate": 7.05148341069014e-06, + "loss": 0.3784, + "step": 3380 + }, + { + "epoch": 2.27, + "learning_rate": 6.929781951076836e-06, + "loss": 0.3737, + "step": 3390 + }, + { + "epoch": 2.28, + "learning_rate": 6.80897067836557e-06, + "loss": 0.3827, + "step": 3400 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 3.0930648367343075e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3400/training_args.bin b/checkpoint-3400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-3400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-3600/README.md b/checkpoint-3600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-3600/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More 
Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-3600/adapter_config.json b/checkpoint-3600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-3600/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-3600/adapter_model.bin b/checkpoint-3600/adapter_model.bin new file mode 100644 index 
0000000000000000000000000000000000000000..d14bbc77c1264fe8f145b8aa4eebb5975f5bf50f --- /dev/null +++ b/checkpoint-3600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09a06680b4cfe60c9e7698a14cf60b4cb1530a325b2acd4d9b9a47425e03ded0 +size 7820185 diff --git a/checkpoint-3600/added_tokens.json b/checkpoint-3600/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-3600/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-3600/optimizer.pt b/checkpoint-3600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..260f112ef11fbaa30b8c89c303286e18c9f516d5 --- /dev/null +++ b/checkpoint-3600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d22a9359029d39e0f2f39bd4010d45e1bf5d86ff3301f30baf6598e0c9d72e7a +size 15644485 diff --git a/checkpoint-3600/rng_state_0.pth b/checkpoint-3600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b45e6eff10e66b3f49e47c5c96e54f46d32f462 --- /dev/null +++ b/checkpoint-3600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca8bef2e76ebfacfc142d064b66d560fe68b06fad3d18287082701a18f4735d3 +size 21687 diff --git a/checkpoint-3600/rng_state_1.pth b/checkpoint-3600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d40d85167a1b98b29a91d959a25c7cfa1b7a4ee --- /dev/null +++ b/checkpoint-3600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e584a372dd9257efc9d570b219718ac26a98c90081d38af5a13319089f71095 +size 21687 diff --git a/checkpoint-3600/rng_state_2.pth b/checkpoint-3600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e83c6de161b343456f15ab42cfebf7d80e02a5bd --- /dev/null +++ b/checkpoint-3600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72f2b35e10a2ffd976fd2fe2f7a603e212bf36509ecea5e9c6ce72706153ea25 +size 21687 diff --git a/checkpoint-3600/rng_state_3.pth b/checkpoint-3600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..55a6151c2d3806a26e6e5ce357c8b1203590be58 --- /dev/null +++ b/checkpoint-3600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c565c0c8946a999e4c6b8d8a4eba6f593ff7a0d793982a52b092bb80ebee734 +size 21687 diff --git a/checkpoint-3600/rng_state_4.pth b/checkpoint-3600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c65e151855ba567672081fa635fb6c0382dd7908 --- /dev/null +++ b/checkpoint-3600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3452ca5ffcdad2a2bd6fcee6450dd5b200140e74836fe7821b147fc7e7c14344 +size 21687 diff --git a/checkpoint-3600/rng_state_5.pth b/checkpoint-3600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..d111c0b1ec0a36b592855506f878c7cc8fdebccd --- /dev/null +++ b/checkpoint-3600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e93196031225fdb509cb2f448d91defed7c6025d8fa1eed12fddb43352133f3 +size 21687 diff --git a/checkpoint-3600/rng_state_6.pth b/checkpoint-3600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f2eb8c63b76ce30f3e28a8cfded37b1a40cd57c --- /dev/null +++ 
b/checkpoint-3600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3235bda3d989c0bbc42a5a3ccb1c5f9507e4d1b1d573ad41dd0d6c7389f4c27a +size 21687 diff --git a/checkpoint-3600/rng_state_7.pth b/checkpoint-3600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..220ee17270fc2f407410146ee057ae556478d6b2 --- /dev/null +++ b/checkpoint-3600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b71feafc8649159a330e3a0d0c0ed5acc9ec99b6457644fc49739e59259a6fd7 +size 21687 diff --git a/checkpoint-3600/scheduler.pt b/checkpoint-3600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2740a8ddec35b94ab075a89f7d4db7e2e0b94ea9 --- /dev/null +++ b/checkpoint-3600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf1efa9327f8f075bbf90ab2507014ac09205041de44b69567a8ecaa742d947f +size 627 diff --git a/checkpoint-3600/special_tokens_map.json b/checkpoint-3600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-3600/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-3600/tokenization_chatglm.py b/checkpoint-3600/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-3600/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-3600/tokenizer.model b/checkpoint-3600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-3600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-3600/tokenizer_config.json b/checkpoint-3600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-3600/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-3600/trainer_state.json b/checkpoint-3600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad948265baba1bc48d4f601b0ab48fe5963c6ae2 --- /dev/null +++ b/checkpoint-3600/trainer_state.json @@ -0,0 +1,2179 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.412464399396884, + "eval_steps": 500, + "global_step": 3600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + }, + { + "epoch": 1.62, + "learning_rate": 2.2006682755823367e-05, + "loss": 0.3871, + "step": 2410 + }, + { + "epoch": 1.62, + "learning_rate": 2.1832551450078594e-05, + "loss": 0.3867, + "step": 2420 + }, + { + "epoch": 1.63, + "learning_rate": 2.165857618125625e-05, + "loss": 0.3878, + "step": 2430 + }, + { + "epoch": 1.64, + "learning_rate": 2.1484765519839843e-05, + "loss": 0.386, + "step": 2440 + }, + { + "epoch": 1.64, + "learning_rate": 2.1311128028203863e-05, + "loss": 0.3836, + "step": 2450 + }, + { + "epoch": 1.65, + "learning_rate": 2.1137672260192004e-05, + "loss": 0.3849, + "step": 2460 + }, + { + "epoch": 1.66, + "learning_rate": 2.09644067606958e-05, + "loss": 0.3841, + "step": 2470 + }, + { + "epoch": 1.66, + "learning_rate": 
2.079134006523359e-05, + "loss": 0.3848, + "step": 2480 + }, + { + "epoch": 1.67, + "learning_rate": 2.061848069953017e-05, + "loss": 0.3817, + "step": 2490 + }, + { + "epoch": 1.68, + "learning_rate": 2.044583717909667e-05, + "loss": 0.3839, + "step": 2500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0273418008811125e-05, + "loss": 0.3749, + "step": 2510 + }, + { + "epoch": 1.69, + "learning_rate": 2.0101231682499506e-05, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 1.7, + "learning_rate": 1.9929286682517235e-05, + "loss": 0.3854, + "step": 2530 + }, + { + "epoch": 1.7, + "learning_rate": 1.9757591479331382e-05, + "loss": 0.3894, + "step": 2540 + }, + { + "epoch": 1.71, + "learning_rate": 1.9586154531103373e-05, + "loss": 0.3871, + "step": 2550 + }, + { + "epoch": 1.72, + "learning_rate": 1.9414984283272286e-05, + "loss": 0.382, + "step": 2560 + }, + { + "epoch": 1.72, + "learning_rate": 1.9244089168138836e-05, + "loss": 0.3858, + "step": 2570 + }, + { + "epoch": 1.73, + "learning_rate": 1.9073477604449985e-05, + "loss": 0.3819, + "step": 2580 + }, + { + "epoch": 1.74, + "learning_rate": 1.8903157996984174e-05, + "loss": 0.3795, + "step": 2590 + }, + { + "epoch": 1.74, + "learning_rate": 1.873313873613733e-05, + "loss": 0.3827, + "step": 2600 + }, + { + "epoch": 1.75, + "learning_rate": 1.8563428197509502e-05, + "loss": 0.385, + "step": 2610 + }, + { + "epoch": 1.76, + "learning_rate": 1.839403474149225e-05, + "loss": 0.3846, + "step": 2620 + }, + { + "epoch": 1.76, + "learning_rate": 1.8224966712856806e-05, + "loss": 0.3828, + "step": 2630 + }, + { + "epoch": 1.77, + "learning_rate": 1.8056232440343013e-05, + "loss": 0.3792, + "step": 2640 + }, + { + "epoch": 1.78, + "learning_rate": 1.788784023624896e-05, + "loss": 0.3814, + "step": 2650 + }, + { + "epoch": 1.78, + "learning_rate": 1.7719798396021558e-05, + "loss": 0.3819, + "step": 2660 + }, + { + "epoch": 1.79, + "learning_rate": 1.7552115197847884e-05, + "loss": 0.3798, + "step": 2670 + }, + { + "epoch": 1.8, + "learning_rate": 1.7384798902247316e-05, + "loss": 0.3772, + "step": 2680 + }, + { + "epoch": 1.8, + "learning_rate": 1.7217857751664663e-05, + "loss": 0.3758, + "step": 2690 + }, + { + "epoch": 1.81, + "learning_rate": 1.7051299970064098e-05, + "loss": 0.3706, + "step": 2700 + }, + { + "epoch": 1.82, + "learning_rate": 1.6885133762523985e-05, + "loss": 0.3805, + "step": 2710 + }, + { + "epoch": 1.82, + "learning_rate": 1.6719367314832756e-05, + "loss": 0.3892, + "step": 2720 + }, + { + "epoch": 1.83, + "learning_rate": 1.65540087930856e-05, + "loss": 0.387, + "step": 2730 + }, + { + "epoch": 1.84, + "learning_rate": 1.6389066343282168e-05, + "loss": 0.3773, + "step": 2740 + }, + { + "epoch": 1.84, + "learning_rate": 1.6224548090925323e-05, + "loss": 0.3829, + "step": 2750 + }, + { + "epoch": 1.85, + "learning_rate": 1.6060462140620835e-05, + "loss": 0.3697, + "step": 2760 + }, + { + "epoch": 1.86, + "learning_rate": 1.589681657567811e-05, + "loss": 0.3817, + "step": 2770 + }, + { + "epoch": 1.86, + "learning_rate": 1.5733619457712037e-05, + "loss": 0.3819, + "step": 2780 + }, + { + "epoch": 1.87, + "learning_rate": 1.5570878826245773e-05, + "loss": 0.3756, + "step": 2790 + }, + { + "epoch": 1.88, + "learning_rate": 1.5408602698314777e-05, + "loss": 0.3791, + "step": 2800 + }, + { + "epoch": 1.88, + "learning_rate": 1.5246799068071818e-05, + "loss": 0.3765, + "step": 2810 + }, + { + "epoch": 1.89, + "learning_rate": 1.5085475906393153e-05, + "loss": 0.3834, + "step": 2820 + }, + { + "epoch": 1.9, + "learning_rate": 
1.4924641160485923e-05, + "loss": 0.381, + "step": 2830 + }, + { + "epoch": 1.9, + "learning_rate": 1.4764302753496584e-05, + "loss": 0.3766, + "step": 2840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4604468584120607e-05, + "loss": 0.3815, + "step": 2850 + }, + { + "epoch": 1.92, + "learning_rate": 1.4445146526213415e-05, + "loss": 0.3774, + "step": 2860 + }, + { + "epoch": 1.92, + "learning_rate": 1.4286344428402454e-05, + "loss": 0.3879, + "step": 2870 + }, + { + "epoch": 1.93, + "learning_rate": 1.412807011370052e-05, + "loss": 0.3777, + "step": 2880 + }, + { + "epoch": 1.94, + "learning_rate": 1.3970331379120455e-05, + "loss": 0.3806, + "step": 2890 + }, + { + "epoch": 1.94, + "learning_rate": 1.3813135995290988e-05, + "loss": 0.3848, + "step": 2900 + }, + { + "epoch": 1.95, + "learning_rate": 1.3656491706073935e-05, + "loss": 0.3745, + "step": 2910 + }, + { + "epoch": 1.96, + "learning_rate": 1.350040622818275e-05, + "loss": 0.377, + "step": 2920 + }, + { + "epoch": 1.96, + "learning_rate": 1.3344887250802345e-05, + "loss": 0.3783, + "step": 2930 + }, + { + "epoch": 1.97, + "learning_rate": 1.3189942435210301e-05, + "loss": 0.3768, + "step": 2940 + }, + { + "epoch": 1.98, + "learning_rate": 1.303557941439949e-05, + "loss": 0.3744, + "step": 2950 + }, + { + "epoch": 1.98, + "learning_rate": 1.2881805792702031e-05, + "loss": 0.3788, + "step": 2960 + }, + { + "epoch": 1.99, + "learning_rate": 1.2728629145414645e-05, + "loss": 0.3735, + "step": 2970 + }, + { + "epoch": 2.0, + "learning_rate": 1.257605701842554e-05, + "loss": 0.3819, + "step": 2980 + }, + { + "epoch": 2.0, + "learning_rate": 1.242409692784265e-05, + "loss": 0.3812, + "step": 2990 + }, + { + "epoch": 2.01, + "learning_rate": 1.2272756359623342e-05, + "loss": 0.3769, + "step": 3000 + }, + { + "epoch": 2.02, + "learning_rate": 1.2122042769205702e-05, + "loss": 0.3779, + "step": 3010 + }, + { + "epoch": 2.02, + "learning_rate": 1.1971963581141196e-05, + "loss": 0.3817, + "step": 3020 + }, + { + "epoch": 2.03, + "learning_rate": 1.1822526188728966e-05, + "loss": 0.3788, + "step": 3030 + }, + { + "epoch": 2.04, + "learning_rate": 1.1673737953651601e-05, + "loss": 0.3776, + "step": 3040 + }, + { + "epoch": 2.04, + "learning_rate": 1.1525606205612447e-05, + "loss": 0.3785, + "step": 3050 + }, + { + "epoch": 2.05, + "learning_rate": 1.1378138241974595e-05, + "loss": 0.3858, + "step": 3060 + }, + { + "epoch": 2.06, + "learning_rate": 1.1231341327401323e-05, + "loss": 0.3766, + "step": 3070 + }, + { + "epoch": 2.06, + "learning_rate": 1.1085222693498256e-05, + "loss": 0.3766, + "step": 3080 + }, + { + "epoch": 2.07, + "learning_rate": 1.093978953845713e-05, + "loss": 0.3795, + "step": 3090 + }, + { + "epoch": 2.08, + "learning_rate": 1.079504902670117e-05, + "loss": 0.3837, + "step": 3100 + }, + { + "epoch": 2.08, + "learning_rate": 1.065100828853213e-05, + "loss": 0.377, + "step": 3110 + }, + { + "epoch": 2.09, + "learning_rate": 1.0507674419779085e-05, + "loss": 0.3759, + "step": 3120 + }, + { + "epoch": 2.1, + "learning_rate": 1.0365054481448849e-05, + "loss": 0.3704, + "step": 3130 + }, + { + "epoch": 2.1, + "learning_rate": 1.02231554993781e-05, + "loss": 0.3751, + "step": 3140 + }, + { + "epoch": 2.11, + "learning_rate": 1.0081984463887325e-05, + "loss": 0.396, + "step": 3150 + }, + { + "epoch": 2.12, + "learning_rate": 9.941548329436425e-06, + "loss": 0.3788, + "step": 3160 + }, + { + "epoch": 2.12, + "learning_rate": 9.801854014282108e-06, + "loss": 0.3767, + "step": 3170 + }, + { + "epoch": 2.13, + "learning_rate": 
9.662908400137125e-06, + "loss": 0.3783, + "step": 3180 + }, + { + "epoch": 2.14, + "learning_rate": 9.524718331831186e-06, + "loss": 0.3775, + "step": 3190 + }, + { + "epoch": 2.14, + "learning_rate": 9.387290616973859e-06, + "loss": 0.3789, + "step": 3200 + }, + { + "epoch": 2.15, + "learning_rate": 9.250632025619104e-06, + "loss": 0.3776, + "step": 3210 + }, + { + "epoch": 2.16, + "learning_rate": 9.11474928993187e-06, + "loss": 0.368, + "step": 3220 + }, + { + "epoch": 2.16, + "learning_rate": 8.979649103856345e-06, + "loss": 0.378, + "step": 3230 + }, + { + "epoch": 2.17, + "learning_rate": 8.84533812278629e-06, + "loss": 0.3776, + "step": 3240 + }, + { + "epoch": 2.18, + "learning_rate": 8.711822963237093e-06, + "loss": 0.3731, + "step": 3250 + }, + { + "epoch": 2.18, + "learning_rate": 8.579110202519894e-06, + "loss": 0.3827, + "step": 3260 + }, + { + "epoch": 2.19, + "learning_rate": 8.447206378417533e-06, + "loss": 0.3725, + "step": 3270 + }, + { + "epoch": 2.2, + "learning_rate": 8.31611798886246e-06, + "loss": 0.372, + "step": 3280 + }, + { + "epoch": 2.2, + "learning_rate": 8.185851491616677e-06, + "loss": 0.3753, + "step": 3290 + }, + { + "epoch": 2.21, + "learning_rate": 8.0564133039536e-06, + "loss": 0.3732, + "step": 3300 + }, + { + "epoch": 2.22, + "learning_rate": 7.927809802341876e-06, + "loss": 0.37, + "step": 3310 + }, + { + "epoch": 2.22, + "learning_rate": 7.800047322131346e-06, + "loss": 0.372, + "step": 3320 + }, + { + "epoch": 2.23, + "learning_rate": 7.673132157240877e-06, + "loss": 0.3734, + "step": 3330 + }, + { + "epoch": 2.24, + "learning_rate": 7.5470705598483405e-06, + "loss": 0.3734, + "step": 3340 + }, + { + "epoch": 2.24, + "learning_rate": 7.4218687400826075e-06, + "loss": 0.3784, + "step": 3350 + }, + { + "epoch": 2.25, + "learning_rate": 7.297532865717638e-06, + "loss": 0.3715, + "step": 3360 + }, + { + "epoch": 2.26, + "learning_rate": 7.174069061868591e-06, + "loss": 0.3836, + "step": 3370 + }, + { + "epoch": 2.27, + "learning_rate": 7.05148341069014e-06, + "loss": 0.3784, + "step": 3380 + }, + { + "epoch": 2.27, + "learning_rate": 6.929781951076836e-06, + "loss": 0.3737, + "step": 3390 + }, + { + "epoch": 2.28, + "learning_rate": 6.80897067836557e-06, + "loss": 0.3827, + "step": 3400 + }, + { + "epoch": 2.29, + "learning_rate": 6.6890555440403015e-06, + "loss": 0.3808, + "step": 3410 + }, + { + "epoch": 2.29, + "learning_rate": 6.570042455438822e-06, + "loss": 0.3797, + "step": 3420 + }, + { + "epoch": 2.3, + "learning_rate": 6.451937275461736e-06, + "loss": 0.3739, + "step": 3430 + }, + { + "epoch": 2.31, + "learning_rate": 6.334745822283699e-06, + "loss": 0.3748, + "step": 3440 + }, + { + "epoch": 2.31, + "learning_rate": 6.2184738690667214e-06, + "loss": 0.3769, + "step": 3450 + }, + { + "epoch": 2.32, + "learning_rate": 6.103127143675832e-06, + "loss": 0.3756, + "step": 3460 + }, + { + "epoch": 2.33, + "learning_rate": 5.988711328396859e-06, + "loss": 0.3738, + "step": 3470 + }, + { + "epoch": 2.33, + "learning_rate": 5.875232059656552e-06, + "loss": 0.3676, + "step": 3480 + }, + { + "epoch": 2.34, + "learning_rate": 5.762694927744866e-06, + "loss": 0.3737, + "step": 3490 + }, + { + "epoch": 2.35, + "learning_rate": 5.651105476539623e-06, + "loss": 0.369, + "step": 3500 + }, + { + "epoch": 2.35, + "learning_rate": 5.540469203233347e-06, + "loss": 0.3723, + "step": 3510 + }, + { + "epoch": 2.36, + "learning_rate": 5.430791558062518e-06, + "loss": 0.3791, + "step": 3520 + }, + { + "epoch": 2.37, + "learning_rate": 5.322077944039039e-06, + "loss": 
0.3753, + "step": 3530 + }, + { + "epoch": 2.37, + "learning_rate": 5.21433371668407e-06, + "loss": 0.3703, + "step": 3540 + }, + { + "epoch": 2.38, + "learning_rate": 5.107564183764219e-06, + "loss": 0.3781, + "step": 3550 + }, + { + "epoch": 2.39, + "learning_rate": 5.001774605030074e-06, + "loss": 0.3766, + "step": 3560 + }, + { + "epoch": 2.39, + "learning_rate": 4.8969701919570454e-06, + "loss": 0.38, + "step": 3570 + }, + { + "epoch": 2.4, + "learning_rate": 4.7931561074887e-06, + "loss": 0.3681, + "step": 3580 + }, + { + "epoch": 2.41, + "learning_rate": 4.690337465782366e-06, + "loss": 0.3752, + "step": 3590 + }, + { + "epoch": 2.41, + "learning_rate": 4.588519331957241e-06, + "loss": 0.3775, + "step": 3600 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 3.275050804453363e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3600/training_args.bin b/checkpoint-3600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-3600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-3800/README.md b/checkpoint-3800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-3800/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
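+
+The original card leaves this section as a placeholder, so what follows is a hedged sketch rather than the authors' documented procedure. It assumes the `base_model` path recorded in `adapter_config.json` (`/home/hz/projects/chatglm3-6b-32k`) is available locally, that an adapter directory such as `checkpoint-3800` is used as-is, and that `model.chat` is the chat helper shipped with ChatGLM3's remote code; paths, dtype, and device placement are illustrative only.
+
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+from peft import PeftModel
+
+# Assumed paths -- taken from adapter_config.json in this repo; adjust locally.
+base_model_path = "/home/hz/projects/chatglm3-6b-32k"
+adapter_path = "./checkpoint-3800"
+
+# ChatGLM3 ships custom tokenizer/model code with the checkpoint, hence trust_remote_code.
+tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
+model = AutoModel.from_pretrained(
+    base_model_path, trust_remote_code=True, torch_dtype=torch.float16, device_map="auto"
+)
+
+# Attach the LoRA adapter trained here (r=8, lora_alpha=32, target module "query_key_value").
+model = PeftModel.from_pretrained(model, adapter_path)
+model = model.eval()
+
+# model.chat() is the conversational helper exposed by ChatGLM3's bundled remote code.
+response, history = model.chat(tokenizer, "Hello", history=[])
+print(response)
+```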
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-3800/adapter_config.json b/checkpoint-3800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-3800/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-3800/adapter_model.bin b/checkpoint-3800/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..518f74b3b97ba5bbadbffeea6e8bfee35e61fed8 --- /dev/null +++ b/checkpoint-3800/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b83b626723749b7641c2192e5ba64d1fc9b77897c729ad0211329eade34a4b0 +size 7820185 diff --git a/checkpoint-3800/added_tokens.json b/checkpoint-3800/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-3800/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-3800/optimizer.pt b/checkpoint-3800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..377e9cdf0f9a7fb7e244bd3792af129e64503ad3 
--- /dev/null +++ b/checkpoint-3800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d95ba40258b904ac3c0a30b01c9eb8f0f7b28562436068ceb7122869556d16a +size 15644485 diff --git a/checkpoint-3800/rng_state_0.pth b/checkpoint-3800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..82cc9e241f8d525a2e4246ca8129afe1b95c25a4 --- /dev/null +++ b/checkpoint-3800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bab44935c748d20d758b16ca4752de5ac45164cb9fa5da6071949f206763667 +size 21687 diff --git a/checkpoint-3800/rng_state_1.pth b/checkpoint-3800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6562020252919159e0506c5bc0d44d0d7476da5a --- /dev/null +++ b/checkpoint-3800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:404060fc2d317e3699296bf579cac08192f73e8bbe5d2b44b11f273268509ae5 +size 21687 diff --git a/checkpoint-3800/rng_state_2.pth b/checkpoint-3800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6cf1cda43a89afcd16f2eb1a77e2fa4688fd0667 --- /dev/null +++ b/checkpoint-3800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e31d5d5af68f406e04e6beb3882cc73ec1c7f7a82803b1a3eed8f078da8b0ea7 +size 21687 diff --git a/checkpoint-3800/rng_state_3.pth b/checkpoint-3800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7bc31b395af38d285c72ee172c34306127ed1acc --- /dev/null +++ b/checkpoint-3800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bddd9094b325e8a2a331b3bbbb9a55da46a0309fe568fb536d54d10379f0dbb7 +size 21687 diff --git a/checkpoint-3800/rng_state_4.pth b/checkpoint-3800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea14a054731da010ff89e98944ecaa17fd4d1808 --- /dev/null +++ b/checkpoint-3800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe80c773266fe7ce509faffbb3b77af09b42b5b8949dc9eb40d4df3b70fd526 +size 21687 diff --git a/checkpoint-3800/rng_state_5.pth b/checkpoint-3800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..591e6852d7351c786e459a72f802520c18c65d4a --- /dev/null +++ b/checkpoint-3800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37ece5766f4cc8c7850b00fb1bacc1f41293aa8154fda9bfdf632af3f60c4de2 +size 21687 diff --git a/checkpoint-3800/rng_state_6.pth b/checkpoint-3800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..55e93db9f5d20af20b8ad9464ae66776adce3ab3 --- /dev/null +++ b/checkpoint-3800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:859868662813a246d92212fd5115cd45f4959962393afe1c8599e11a521b0c84 +size 21687 diff --git a/checkpoint-3800/rng_state_7.pth b/checkpoint-3800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f67e21eba8cd98ab013ad51e59eee5c5e007b5f --- /dev/null +++ b/checkpoint-3800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85f45d8fd700c5bb34efc27a317181c7f7f692baca412ebe60de6e02e1404972 +size 21687 diff --git a/checkpoint-3800/scheduler.pt b/checkpoint-3800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..519a084ef19c78d083d9d846da62bc4502d781c7 --- /dev/null +++ b/checkpoint-3800/scheduler.pt 
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afec17e9b30a67d2d4eb8b5b2e9517c375f0e88c2c6f9025b846093723500b7f
+size 627
diff --git a/checkpoint-3800/special_tokens_map.json b/checkpoint-3800/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5
--- /dev/null
+++ b/checkpoint-3800/special_tokens_map.json
@@ -0,0 +1,6 @@
+{
+  "additional_special_tokens": [
+    "<|user|>",
+    "<|observation|>"
+  ]
+}
diff --git a/checkpoint-3800/tokenization_chatglm.py b/checkpoint-3800/tokenization_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a
--- /dev/null
+++ b/checkpoint-3800/tokenization_chatglm.py
@@ -0,0 +1,283 @@
+import json
+import os
+import torch
+from typing import List, Optional, Union, Dict
+from sentencepiece import SentencePieceProcessor
+from transformers import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+
+
+class SPTokenizer:
+    def __init__(self, model_path: str):
+        # reload tokenizer
+        assert os.path.isfile(model_path), model_path
+        self.sp_model = SentencePieceProcessor(model_file=model_path)
+
+        # BOS / EOS token IDs
+        self.n_words: int = self.sp_model.vocab_size()
+        self.bos_id: int = self.sp_model.bos_id()
+        self.eos_id: int = self.sp_model.eos_id()
+        self.pad_id: int = self.sp_model.unk_id()
+        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
+
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>",
+                          "<|observation|>"]
+        self.special_tokens = {}
+        self.index_special_tokens = {}
+        for token in special_tokens:
+            self.special_tokens[token] = self.n_words
+            self.index_special_tokens[self.n_words] = token
+            self.n_words += 1
+
+    def tokenize(self, s: str):
+        return self.sp_model.EncodeAsPieces(s)
+
+    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
+        assert type(s) is str
+        t = self.sp_model.encode(s)
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+        return t
+
+    def decode(self, t: List[int]) -> str:
+        text, buffer = "", []
+        for token in t:
+            if token in self.index_special_tokens:
+                if buffer:
+                    text += self.sp_model.decode(buffer)
+                    buffer = []
+                text += self.index_special_tokens[token]
+            else:
+                buffer.append(token)
+        if buffer:
+            text += self.sp_model.decode(buffer)
+        return text
+
+    def decode_tokens(self, tokens: List[str]) -> str:
+        text = self.sp_model.DecodePieces(tokens)
+        return text
+
+    def convert_token_to_id(self, token):
+        """ Converts a token (str) to an id using the vocab. """
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        assert self.padding_side == "left"
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+        seq_length = len(required_input)
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * seq_length
+
+        if "position_ids" not in encoded_inputs:
+            encoded_inputs["position_ids"] = list(range(seq_length))
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+            if "position_ids" in encoded_inputs:
+                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
+            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+
+        return encoded_inputs
diff --git a/checkpoint-3800/tokenizer.model b/checkpoint-3800/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba
--- /dev/null
+++ b/checkpoint-3800/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
+size 1018370
diff --git a/checkpoint-3800/tokenizer_config.json b/checkpoint-3800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51
--- /dev/null
+++ b/checkpoint-3800/tokenizer_config.json
@@ -0,0 +1,38 @@
+{
+  "added_tokens_decoder": {
+    "64795": {
+      "content": "<|user|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "64797": {
+      "content": "<|observation|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|user|>",
+    "<|observation|>"
+  ],
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_chatglm.ChatGLMTokenizer",
+      null
+    ]
+  },
+  "clean_up_tokenization_spaces": false,
+  "do_lower_case": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "padding_side": "right",
+  "remove_space": false,
+  "split_special_tokens": false,
+  "tokenizer_class": "ChatGLMTokenizer",
+  "tokenizer_file": null
+}
diff --git a/checkpoint-3800/trainer_state.json b/checkpoint-3800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2eb304974cac455231717e90a58ff7de83948db8 --- /dev/null +++ b/checkpoint-3800/trainer_state.json @@ -0,0 +1,2299 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.5464901993633777, + "eval_steps": 500, + "global_step": 3800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + }, + { + "epoch": 1.62, + "learning_rate": 2.2006682755823367e-05, + "loss": 0.3871, + "step": 2410 + }, + { + "epoch": 1.62, + "learning_rate": 2.1832551450078594e-05, + "loss": 0.3867, + "step": 2420 + }, + { + "epoch": 1.63, + "learning_rate": 2.165857618125625e-05, + "loss": 0.3878, + "step": 2430 + }, + { + "epoch": 1.64, + "learning_rate": 2.1484765519839843e-05, + "loss": 0.386, + "step": 2440 + }, + { + "epoch": 1.64, + "learning_rate": 2.1311128028203863e-05, + "loss": 0.3836, + "step": 2450 + }, + { + "epoch": 1.65, + "learning_rate": 2.1137672260192004e-05, + "loss": 0.3849, + "step": 2460 + }, + { + "epoch": 1.66, + "learning_rate": 2.09644067606958e-05, + "loss": 0.3841, + "step": 2470 + }, + { + "epoch": 1.66, + "learning_rate": 
2.079134006523359e-05, + "loss": 0.3848, + "step": 2480 + }, + { + "epoch": 1.67, + "learning_rate": 2.061848069953017e-05, + "loss": 0.3817, + "step": 2490 + }, + { + "epoch": 1.68, + "learning_rate": 2.044583717909667e-05, + "loss": 0.3839, + "step": 2500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0273418008811125e-05, + "loss": 0.3749, + "step": 2510 + }, + { + "epoch": 1.69, + "learning_rate": 2.0101231682499506e-05, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 1.7, + "learning_rate": 1.9929286682517235e-05, + "loss": 0.3854, + "step": 2530 + }, + { + "epoch": 1.7, + "learning_rate": 1.9757591479331382e-05, + "loss": 0.3894, + "step": 2540 + }, + { + "epoch": 1.71, + "learning_rate": 1.9586154531103373e-05, + "loss": 0.3871, + "step": 2550 + }, + { + "epoch": 1.72, + "learning_rate": 1.9414984283272286e-05, + "loss": 0.382, + "step": 2560 + }, + { + "epoch": 1.72, + "learning_rate": 1.9244089168138836e-05, + "loss": 0.3858, + "step": 2570 + }, + { + "epoch": 1.73, + "learning_rate": 1.9073477604449985e-05, + "loss": 0.3819, + "step": 2580 + }, + { + "epoch": 1.74, + "learning_rate": 1.8903157996984174e-05, + "loss": 0.3795, + "step": 2590 + }, + { + "epoch": 1.74, + "learning_rate": 1.873313873613733e-05, + "loss": 0.3827, + "step": 2600 + }, + { + "epoch": 1.75, + "learning_rate": 1.8563428197509502e-05, + "loss": 0.385, + "step": 2610 + }, + { + "epoch": 1.76, + "learning_rate": 1.839403474149225e-05, + "loss": 0.3846, + "step": 2620 + }, + { + "epoch": 1.76, + "learning_rate": 1.8224966712856806e-05, + "loss": 0.3828, + "step": 2630 + }, + { + "epoch": 1.77, + "learning_rate": 1.8056232440343013e-05, + "loss": 0.3792, + "step": 2640 + }, + { + "epoch": 1.78, + "learning_rate": 1.788784023624896e-05, + "loss": 0.3814, + "step": 2650 + }, + { + "epoch": 1.78, + "learning_rate": 1.7719798396021558e-05, + "loss": 0.3819, + "step": 2660 + }, + { + "epoch": 1.79, + "learning_rate": 1.7552115197847884e-05, + "loss": 0.3798, + "step": 2670 + }, + { + "epoch": 1.8, + "learning_rate": 1.7384798902247316e-05, + "loss": 0.3772, + "step": 2680 + }, + { + "epoch": 1.8, + "learning_rate": 1.7217857751664663e-05, + "loss": 0.3758, + "step": 2690 + }, + { + "epoch": 1.81, + "learning_rate": 1.7051299970064098e-05, + "loss": 0.3706, + "step": 2700 + }, + { + "epoch": 1.82, + "learning_rate": 1.6885133762523985e-05, + "loss": 0.3805, + "step": 2710 + }, + { + "epoch": 1.82, + "learning_rate": 1.6719367314832756e-05, + "loss": 0.3892, + "step": 2720 + }, + { + "epoch": 1.83, + "learning_rate": 1.65540087930856e-05, + "loss": 0.387, + "step": 2730 + }, + { + "epoch": 1.84, + "learning_rate": 1.6389066343282168e-05, + "loss": 0.3773, + "step": 2740 + }, + { + "epoch": 1.84, + "learning_rate": 1.6224548090925323e-05, + "loss": 0.3829, + "step": 2750 + }, + { + "epoch": 1.85, + "learning_rate": 1.6060462140620835e-05, + "loss": 0.3697, + "step": 2760 + }, + { + "epoch": 1.86, + "learning_rate": 1.589681657567811e-05, + "loss": 0.3817, + "step": 2770 + }, + { + "epoch": 1.86, + "learning_rate": 1.5733619457712037e-05, + "loss": 0.3819, + "step": 2780 + }, + { + "epoch": 1.87, + "learning_rate": 1.5570878826245773e-05, + "loss": 0.3756, + "step": 2790 + }, + { + "epoch": 1.88, + "learning_rate": 1.5408602698314777e-05, + "loss": 0.3791, + "step": 2800 + }, + { + "epoch": 1.88, + "learning_rate": 1.5246799068071818e-05, + "loss": 0.3765, + "step": 2810 + }, + { + "epoch": 1.89, + "learning_rate": 1.5085475906393153e-05, + "loss": 0.3834, + "step": 2820 + }, + { + "epoch": 1.9, + "learning_rate": 
1.4924641160485923e-05, + "loss": 0.381, + "step": 2830 + }, + { + "epoch": 1.9, + "learning_rate": 1.4764302753496584e-05, + "loss": 0.3766, + "step": 2840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4604468584120607e-05, + "loss": 0.3815, + "step": 2850 + }, + { + "epoch": 1.92, + "learning_rate": 1.4445146526213415e-05, + "loss": 0.3774, + "step": 2860 + }, + { + "epoch": 1.92, + "learning_rate": 1.4286344428402454e-05, + "loss": 0.3879, + "step": 2870 + }, + { + "epoch": 1.93, + "learning_rate": 1.412807011370052e-05, + "loss": 0.3777, + "step": 2880 + }, + { + "epoch": 1.94, + "learning_rate": 1.3970331379120455e-05, + "loss": 0.3806, + "step": 2890 + }, + { + "epoch": 1.94, + "learning_rate": 1.3813135995290988e-05, + "loss": 0.3848, + "step": 2900 + }, + { + "epoch": 1.95, + "learning_rate": 1.3656491706073935e-05, + "loss": 0.3745, + "step": 2910 + }, + { + "epoch": 1.96, + "learning_rate": 1.350040622818275e-05, + "loss": 0.377, + "step": 2920 + }, + { + "epoch": 1.96, + "learning_rate": 1.3344887250802345e-05, + "loss": 0.3783, + "step": 2930 + }, + { + "epoch": 1.97, + "learning_rate": 1.3189942435210301e-05, + "loss": 0.3768, + "step": 2940 + }, + { + "epoch": 1.98, + "learning_rate": 1.303557941439949e-05, + "loss": 0.3744, + "step": 2950 + }, + { + "epoch": 1.98, + "learning_rate": 1.2881805792702031e-05, + "loss": 0.3788, + "step": 2960 + }, + { + "epoch": 1.99, + "learning_rate": 1.2728629145414645e-05, + "loss": 0.3735, + "step": 2970 + }, + { + "epoch": 2.0, + "learning_rate": 1.257605701842554e-05, + "loss": 0.3819, + "step": 2980 + }, + { + "epoch": 2.0, + "learning_rate": 1.242409692784265e-05, + "loss": 0.3812, + "step": 2990 + }, + { + "epoch": 2.01, + "learning_rate": 1.2272756359623342e-05, + "loss": 0.3769, + "step": 3000 + }, + { + "epoch": 2.02, + "learning_rate": 1.2122042769205702e-05, + "loss": 0.3779, + "step": 3010 + }, + { + "epoch": 2.02, + "learning_rate": 1.1971963581141196e-05, + "loss": 0.3817, + "step": 3020 + }, + { + "epoch": 2.03, + "learning_rate": 1.1822526188728966e-05, + "loss": 0.3788, + "step": 3030 + }, + { + "epoch": 2.04, + "learning_rate": 1.1673737953651601e-05, + "loss": 0.3776, + "step": 3040 + }, + { + "epoch": 2.04, + "learning_rate": 1.1525606205612447e-05, + "loss": 0.3785, + "step": 3050 + }, + { + "epoch": 2.05, + "learning_rate": 1.1378138241974595e-05, + "loss": 0.3858, + "step": 3060 + }, + { + "epoch": 2.06, + "learning_rate": 1.1231341327401323e-05, + "loss": 0.3766, + "step": 3070 + }, + { + "epoch": 2.06, + "learning_rate": 1.1085222693498256e-05, + "loss": 0.3766, + "step": 3080 + }, + { + "epoch": 2.07, + "learning_rate": 1.093978953845713e-05, + "loss": 0.3795, + "step": 3090 + }, + { + "epoch": 2.08, + "learning_rate": 1.079504902670117e-05, + "loss": 0.3837, + "step": 3100 + }, + { + "epoch": 2.08, + "learning_rate": 1.065100828853213e-05, + "loss": 0.377, + "step": 3110 + }, + { + "epoch": 2.09, + "learning_rate": 1.0507674419779085e-05, + "loss": 0.3759, + "step": 3120 + }, + { + "epoch": 2.1, + "learning_rate": 1.0365054481448849e-05, + "loss": 0.3704, + "step": 3130 + }, + { + "epoch": 2.1, + "learning_rate": 1.02231554993781e-05, + "loss": 0.3751, + "step": 3140 + }, + { + "epoch": 2.11, + "learning_rate": 1.0081984463887325e-05, + "loss": 0.396, + "step": 3150 + }, + { + "epoch": 2.12, + "learning_rate": 9.941548329436425e-06, + "loss": 0.3788, + "step": 3160 + }, + { + "epoch": 2.12, + "learning_rate": 9.801854014282108e-06, + "loss": 0.3767, + "step": 3170 + }, + { + "epoch": 2.13, + "learning_rate": 
9.662908400137125e-06, + "loss": 0.3783, + "step": 3180 + }, + { + "epoch": 2.14, + "learning_rate": 9.524718331831186e-06, + "loss": 0.3775, + "step": 3190 + }, + { + "epoch": 2.14, + "learning_rate": 9.387290616973859e-06, + "loss": 0.3789, + "step": 3200 + }, + { + "epoch": 2.15, + "learning_rate": 9.250632025619104e-06, + "loss": 0.3776, + "step": 3210 + }, + { + "epoch": 2.16, + "learning_rate": 9.11474928993187e-06, + "loss": 0.368, + "step": 3220 + }, + { + "epoch": 2.16, + "learning_rate": 8.979649103856345e-06, + "loss": 0.378, + "step": 3230 + }, + { + "epoch": 2.17, + "learning_rate": 8.84533812278629e-06, + "loss": 0.3776, + "step": 3240 + }, + { + "epoch": 2.18, + "learning_rate": 8.711822963237093e-06, + "loss": 0.3731, + "step": 3250 + }, + { + "epoch": 2.18, + "learning_rate": 8.579110202519894e-06, + "loss": 0.3827, + "step": 3260 + }, + { + "epoch": 2.19, + "learning_rate": 8.447206378417533e-06, + "loss": 0.3725, + "step": 3270 + }, + { + "epoch": 2.2, + "learning_rate": 8.31611798886246e-06, + "loss": 0.372, + "step": 3280 + }, + { + "epoch": 2.2, + "learning_rate": 8.185851491616677e-06, + "loss": 0.3753, + "step": 3290 + }, + { + "epoch": 2.21, + "learning_rate": 8.0564133039536e-06, + "loss": 0.3732, + "step": 3300 + }, + { + "epoch": 2.22, + "learning_rate": 7.927809802341876e-06, + "loss": 0.37, + "step": 3310 + }, + { + "epoch": 2.22, + "learning_rate": 7.800047322131346e-06, + "loss": 0.372, + "step": 3320 + }, + { + "epoch": 2.23, + "learning_rate": 7.673132157240877e-06, + "loss": 0.3734, + "step": 3330 + }, + { + "epoch": 2.24, + "learning_rate": 7.5470705598483405e-06, + "loss": 0.3734, + "step": 3340 + }, + { + "epoch": 2.24, + "learning_rate": 7.4218687400826075e-06, + "loss": 0.3784, + "step": 3350 + }, + { + "epoch": 2.25, + "learning_rate": 7.297532865717638e-06, + "loss": 0.3715, + "step": 3360 + }, + { + "epoch": 2.26, + "learning_rate": 7.174069061868591e-06, + "loss": 0.3836, + "step": 3370 + }, + { + "epoch": 2.27, + "learning_rate": 7.05148341069014e-06, + "loss": 0.3784, + "step": 3380 + }, + { + "epoch": 2.27, + "learning_rate": 6.929781951076836e-06, + "loss": 0.3737, + "step": 3390 + }, + { + "epoch": 2.28, + "learning_rate": 6.80897067836557e-06, + "loss": 0.3827, + "step": 3400 + }, + { + "epoch": 2.29, + "learning_rate": 6.6890555440403015e-06, + "loss": 0.3808, + "step": 3410 + }, + { + "epoch": 2.29, + "learning_rate": 6.570042455438822e-06, + "loss": 0.3797, + "step": 3420 + }, + { + "epoch": 2.3, + "learning_rate": 6.451937275461736e-06, + "loss": 0.3739, + "step": 3430 + }, + { + "epoch": 2.31, + "learning_rate": 6.334745822283699e-06, + "loss": 0.3748, + "step": 3440 + }, + { + "epoch": 2.31, + "learning_rate": 6.2184738690667214e-06, + "loss": 0.3769, + "step": 3450 + }, + { + "epoch": 2.32, + "learning_rate": 6.103127143675832e-06, + "loss": 0.3756, + "step": 3460 + }, + { + "epoch": 2.33, + "learning_rate": 5.988711328396859e-06, + "loss": 0.3738, + "step": 3470 + }, + { + "epoch": 2.33, + "learning_rate": 5.875232059656552e-06, + "loss": 0.3676, + "step": 3480 + }, + { + "epoch": 2.34, + "learning_rate": 5.762694927744866e-06, + "loss": 0.3737, + "step": 3490 + }, + { + "epoch": 2.35, + "learning_rate": 5.651105476539623e-06, + "loss": 0.369, + "step": 3500 + }, + { + "epoch": 2.35, + "learning_rate": 5.540469203233347e-06, + "loss": 0.3723, + "step": 3510 + }, + { + "epoch": 2.36, + "learning_rate": 5.430791558062518e-06, + "loss": 0.3791, + "step": 3520 + }, + { + "epoch": 2.37, + "learning_rate": 5.322077944039039e-06, + "loss": 
0.3753, + "step": 3530 + }, + { + "epoch": 2.37, + "learning_rate": 5.21433371668407e-06, + "loss": 0.3703, + "step": 3540 + }, + { + "epoch": 2.38, + "learning_rate": 5.107564183764219e-06, + "loss": 0.3781, + "step": 3550 + }, + { + "epoch": 2.39, + "learning_rate": 5.001774605030074e-06, + "loss": 0.3766, + "step": 3560 + }, + { + "epoch": 2.39, + "learning_rate": 4.8969701919570454e-06, + "loss": 0.38, + "step": 3570 + }, + { + "epoch": 2.4, + "learning_rate": 4.7931561074887e-06, + "loss": 0.3681, + "step": 3580 + }, + { + "epoch": 2.41, + "learning_rate": 4.690337465782366e-06, + "loss": 0.3752, + "step": 3590 + }, + { + "epoch": 2.41, + "learning_rate": 4.588519331957241e-06, + "loss": 0.3775, + "step": 3600 + }, + { + "epoch": 2.42, + "learning_rate": 4.4877067218448285e-06, + "loss": 0.3677, + "step": 3610 + }, + { + "epoch": 2.43, + "learning_rate": 4.38790460174188e-06, + "loss": 0.3718, + "step": 3620 + }, + { + "epoch": 2.43, + "learning_rate": 4.289117888165708e-06, + "loss": 0.3671, + "step": 3630 + }, + { + "epoch": 2.44, + "learning_rate": 4.191351447612032e-06, + "loss": 0.3728, + "step": 3640 + }, + { + "epoch": 2.45, + "learning_rate": 4.094610096315199e-06, + "loss": 0.3769, + "step": 3650 + }, + { + "epoch": 2.45, + "learning_rate": 3.998898600010928e-06, + "loss": 0.3777, + "step": 3660 + }, + { + "epoch": 2.46, + "learning_rate": 3.904221673701566e-06, + "loss": 0.3817, + "step": 3670 + }, + { + "epoch": 2.47, + "learning_rate": 3.810583981423796e-06, + "loss": 0.383, + "step": 3680 + }, + { + "epoch": 2.47, + "learning_rate": 3.7179901360188533e-06, + "loss": 0.3719, + "step": 3690 + }, + { + "epoch": 2.48, + "learning_rate": 3.626444698905329e-06, + "loss": 0.3716, + "step": 3700 + }, + { + "epoch": 2.49, + "learning_rate": 3.5359521798544347e-06, + "loss": 0.3736, + "step": 3710 + }, + { + "epoch": 2.49, + "learning_rate": 3.4465170367678294e-06, + "loss": 0.3741, + "step": 3720 + }, + { + "epoch": 2.5, + "learning_rate": 3.3581436754580363e-06, + "loss": 0.3756, + "step": 3730 + }, + { + "epoch": 2.51, + "learning_rate": 3.270836449431397e-06, + "loss": 0.3777, + "step": 3740 + }, + { + "epoch": 2.51, + "learning_rate": 3.184599659673579e-06, + "loss": 0.3774, + "step": 3750 + }, + { + "epoch": 2.52, + "learning_rate": 3.0994375544377424e-06, + "loss": 0.3785, + "step": 3760 + }, + { + "epoch": 2.53, + "learning_rate": 3.0153543290352164e-06, + "loss": 0.3768, + "step": 3770 + }, + { + "epoch": 2.53, + "learning_rate": 2.932354125628853e-06, + "loss": 0.377, + "step": 3780 + }, + { + "epoch": 2.54, + "learning_rate": 2.8504410330289778e-06, + "loss": 0.3803, + "step": 3790 + }, + { + "epoch": 2.55, + "learning_rate": 2.769619086491923e-06, + "loss": 0.3706, + "step": 3800 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 3.4569098601837887e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3800/training_args.bin b/checkpoint-3800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-3800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-400/README.md b/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-400/README.md @@ 
-0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
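The "How to Get Started with the Model" section of this card is left as a placeholder. A minimal sketch of loading this checkpoint's LoRA adapter with PEFT 0.6.x (the base path is taken from the adapter_config.json saved alongside; the checkpoint directory name is illustrative, and this is not a procedure recorded in the log itself):

```python
from transformers import AutoModel, AutoTokenizer
from peft import PeftModel

base = "/home/hz/projects/chatglm3-6b-32k"  # base_model_name_or_path in adapter_config.json
adapter = "./checkpoint-400"                # illustrative: any checkpoint directory saved by the run

# ChatGLM ships custom modeling/tokenizer code, hence trust_remote_code=True
tokenizer = AutoTokenizer.from_pretrained(base, trust_remote_code=True)
model = AutoModel.from_pretrained(base, trust_remote_code=True).half().cuda()

# Attach the saved LoRA weights (r=8, lora_alpha=32, target module query_key_value)
model = PeftModel.from_pretrained(model, adapter).eval()
```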
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-400/adapter_config.json b/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-400/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-400/adapter_model.bin b/checkpoint-400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..cfd8e33f4ee65fa597625fe7ddfe9842eeee1645 --- /dev/null +++ b/checkpoint-400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b16ff5a813229551f0851a18c223983461277b4cff5474842bea5cb7ecdd516 +size 7820185 diff --git a/checkpoint-400/added_tokens.json b/checkpoint-400/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-400/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..798c4bbd1bce458b42bfd92b3c8b4a8b22d0e894 --- /dev/null +++ b/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:907e0770908e39c39683e54cb3ccc50c144d0fedde96ff1e57ebce73093e89ac +size 15644485 diff --git a/checkpoint-400/rng_state_0.pth b/checkpoint-400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b1bedcce0bf400a24afbe78ff446284552daa308 --- /dev/null +++ b/checkpoint-400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87113f9a60c85ba1cfbf4dcb511c9def8e4559e6982acba15aa8d680fadad5a4 +size 21687 diff --git a/checkpoint-400/rng_state_1.pth b/checkpoint-400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f12cf8c1aeecc247a62b84c38d0405a34b098c6d --- /dev/null +++ b/checkpoint-400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1f760f386865c0aa41405a9826b9161050f5a0c0a2ba970e7adbefebe0bd8f67 +size 21687 diff --git a/checkpoint-400/rng_state_2.pth b/checkpoint-400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e5a40bf5eee653e4ec7f57c12f86b7e4f00fd695 --- /dev/null +++ b/checkpoint-400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b4f814b1a01997d8a9c0db88ef704c72ecae0cfb22234e80fff419ca262e7b +size 21687 diff --git a/checkpoint-400/rng_state_3.pth b/checkpoint-400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e75a88cf59428196e9652b0b9ea54a629b6c2b29 --- /dev/null +++ b/checkpoint-400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:721ded189bdeaa07801ba403b2dddb773508a74620c863123ba0dc63371f08f0 +size 21687 diff --git a/checkpoint-400/rng_state_4.pth b/checkpoint-400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..826926c60f67d03ccc40a1868f3707bd6f2b3fa2 --- /dev/null +++ b/checkpoint-400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27e4e6fdbfaed1c53b66c1b7fbf75d831c823adba1bec5e43e68254dd9941be3 +size 21687 diff --git a/checkpoint-400/rng_state_5.pth b/checkpoint-400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..8b31871f9a099cb906128612cda5ccc51ad09340 --- /dev/null +++ b/checkpoint-400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf1625b664fb7a96b417350e2155eed05413e028691def18d3a6a05061074917 +size 21687 diff --git a/checkpoint-400/rng_state_6.pth b/checkpoint-400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..3f08b5602f81e2a2154cc719869d4a90e931b4f2 --- /dev/null +++ b/checkpoint-400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da5f04bf5b9875135131743cc973dbfe0705a4ebef58703686d714968b9e66f7 +size 21687 diff --git a/checkpoint-400/rng_state_7.pth b/checkpoint-400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..149916be3ed43ce9e1238dce231099ade6ece832 --- /dev/null +++ b/checkpoint-400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7ed3d012cc26fa700c659c363523166bc0cf31ae6d84310d4165dd2a5bad0a7 +size 21687 diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ab7b03af1b2bf89b8350524b4a10db978d103f2 --- /dev/null +++ b/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1252321247b0e19d08cc5640b43c0e102b0814a4c5849bf24e765d050ce4362d +size 627 diff --git a/checkpoint-400/special_tokens_map.json b/checkpoint-400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-400/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-400/tokenization_chatglm.py b/checkpoint-400/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-400/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor 
+from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-400/tokenizer.model b/checkpoint-400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-400/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..45d3ce04b6d41d61ee013c8f0fe2537d8176ad44 --- /dev/null +++ b/checkpoint-400/trainer_state.json @@ -0,0 +1,259 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2680515999329871, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 10 + }, + { 
+ "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 4.9161716511842614e-05, + 
"loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 3.6377843599451095e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-4000/README.md b/checkpoint-4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-4000/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-4000/adapter_config.json b/checkpoint-4000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-4000/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-4000/adapter_model.bin b/checkpoint-4000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..81461885ac527eac26553c147794396b2a378b8d --- /dev/null +++ b/checkpoint-4000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc181329b26978a74eb89f2b085e60e25095a4c99172a47c74c082ad57ab4807 +size 7820185 diff --git a/checkpoint-4000/added_tokens.json b/checkpoint-4000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-4000/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff36a73de69db18bf806a1906dfc74a4a922b9a0 --- /dev/null +++ b/checkpoint-4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f8a618322d769c1944a25d310c0a68d782a7006dfcca4360ff615422681bce3 +size 15644485 diff --git a/checkpoint-4000/rng_state_0.pth b/checkpoint-4000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b7690bfdaa44225acfc650ee1eef0cee1a2cea7 --- /dev/null +++ b/checkpoint-4000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:491c75b5394adc5adc4279ca59e50706499ce05765c703be2e2802c3eaaafb1f +size 21687 diff --git a/checkpoint-4000/rng_state_1.pth b/checkpoint-4000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2886ccdd746ca9a021db06c8f687d87852741928 --- /dev/null +++ b/checkpoint-4000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:761a9bf2dea3ab00655c8255419c6150090f8ef518a76df343bab74a7fab98b4 +size 21687 diff --git a/checkpoint-4000/rng_state_2.pth b/checkpoint-4000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a39c4ab68b8a7b594d5191a6efa04971de2cdd6 --- /dev/null +++ b/checkpoint-4000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc0152107f72ee1935c7430a345af2506e407ace6e5cbc64085a32f349a1fe69 +size 21687 diff --git a/checkpoint-4000/rng_state_3.pth b/checkpoint-4000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b3da98bfda701a69144973813265874ddd48038b --- /dev/null +++ b/checkpoint-4000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cc4d43ddb316d1bccb75b3866b43bf53588523ecaad1fd6366488b2e0e9c529 +size 21687 diff --git a/checkpoint-4000/rng_state_4.pth b/checkpoint-4000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..f880d8dbc12a7a2373364d036c351f69d8e54856 --- /dev/null +++ b/checkpoint-4000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24c9f12d9a5a0a1e402aed3b797f32671c0352a20dc2e53ca5af272f0be725c5 +size 21687 diff --git a/checkpoint-4000/rng_state_5.pth b/checkpoint-4000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..59e92c172a4683e3a1ca1109cd368c2e86869e06 --- /dev/null +++ b/checkpoint-4000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e6f344c86086bedb468a06fabd3d8147e4217dd28ef03e72547ace779b8de3d +size 21687 diff --git a/checkpoint-4000/rng_state_6.pth b/checkpoint-4000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7762ff678797975b5a74db4bee4129c93d5c682 --- /dev/null +++ b/checkpoint-4000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa9fb54f1958ad411d849a4e2bb2db13093d9f3303ecaff0cc2e5e1181714b74 +size 21687 diff --git a/checkpoint-4000/rng_state_7.pth b/checkpoint-4000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a8661e30fef6956d9eaabc3211605297f5079c0 --- /dev/null +++ b/checkpoint-4000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53763248a6a4438ed08c580bfd8368a19e8fa9ecd88052f80ab30ada1da2f754 +size 21687 diff --git a/checkpoint-4000/scheduler.pt b/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..64d268049d424fb9124ed781072b1e10f7882365 --- /dev/null +++ b/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07283e4f0470b5ca743dc40bc7a0f23435793ff9269846cc3edda816f64fd8ed +size 627 diff --git a/checkpoint-4000/special_tokens_map.json b/checkpoint-4000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-4000/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-4000/tokenization_chatglm.py b/checkpoint-4000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-4000/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece 
import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-4000/tokenizer.model b/checkpoint-4000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-4000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-4000/tokenizer_config.json b/checkpoint-4000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-4000/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7564f906884e2e0ead4762dae6f4a99c94b04509 --- /dev/null +++ b/checkpoint-4000/trainer_state.json @@ -0,0 +1,2419 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.680515999329871, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + }, + { + "epoch": 1.62, + "learning_rate": 2.2006682755823367e-05, + "loss": 0.3871, + "step": 2410 + }, + { + "epoch": 1.62, + "learning_rate": 2.1832551450078594e-05, + "loss": 0.3867, + "step": 2420 + }, + { + "epoch": 1.63, + "learning_rate": 2.165857618125625e-05, + "loss": 0.3878, + "step": 2430 + }, + { + "epoch": 1.64, + "learning_rate": 2.1484765519839843e-05, + "loss": 0.386, + "step": 2440 + }, + { + "epoch": 1.64, + "learning_rate": 2.1311128028203863e-05, + "loss": 0.3836, + "step": 2450 + }, + { + "epoch": 1.65, + "learning_rate": 2.1137672260192004e-05, + "loss": 0.3849, + "step": 2460 + }, + { + "epoch": 1.66, + "learning_rate": 2.09644067606958e-05, + "loss": 0.3841, + "step": 2470 + }, + { + "epoch": 1.66, + "learning_rate": 
2.079134006523359e-05, + "loss": 0.3848, + "step": 2480 + }, + { + "epoch": 1.67, + "learning_rate": 2.061848069953017e-05, + "loss": 0.3817, + "step": 2490 + }, + { + "epoch": 1.68, + "learning_rate": 2.044583717909667e-05, + "loss": 0.3839, + "step": 2500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0273418008811125e-05, + "loss": 0.3749, + "step": 2510 + }, + { + "epoch": 1.69, + "learning_rate": 2.0101231682499506e-05, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 1.7, + "learning_rate": 1.9929286682517235e-05, + "loss": 0.3854, + "step": 2530 + }, + { + "epoch": 1.7, + "learning_rate": 1.9757591479331382e-05, + "loss": 0.3894, + "step": 2540 + }, + { + "epoch": 1.71, + "learning_rate": 1.9586154531103373e-05, + "loss": 0.3871, + "step": 2550 + }, + { + "epoch": 1.72, + "learning_rate": 1.9414984283272286e-05, + "loss": 0.382, + "step": 2560 + }, + { + "epoch": 1.72, + "learning_rate": 1.9244089168138836e-05, + "loss": 0.3858, + "step": 2570 + }, + { + "epoch": 1.73, + "learning_rate": 1.9073477604449985e-05, + "loss": 0.3819, + "step": 2580 + }, + { + "epoch": 1.74, + "learning_rate": 1.8903157996984174e-05, + "loss": 0.3795, + "step": 2590 + }, + { + "epoch": 1.74, + "learning_rate": 1.873313873613733e-05, + "loss": 0.3827, + "step": 2600 + }, + { + "epoch": 1.75, + "learning_rate": 1.8563428197509502e-05, + "loss": 0.385, + "step": 2610 + }, + { + "epoch": 1.76, + "learning_rate": 1.839403474149225e-05, + "loss": 0.3846, + "step": 2620 + }, + { + "epoch": 1.76, + "learning_rate": 1.8224966712856806e-05, + "loss": 0.3828, + "step": 2630 + }, + { + "epoch": 1.77, + "learning_rate": 1.8056232440343013e-05, + "loss": 0.3792, + "step": 2640 + }, + { + "epoch": 1.78, + "learning_rate": 1.788784023624896e-05, + "loss": 0.3814, + "step": 2650 + }, + { + "epoch": 1.78, + "learning_rate": 1.7719798396021558e-05, + "loss": 0.3819, + "step": 2660 + }, + { + "epoch": 1.79, + "learning_rate": 1.7552115197847884e-05, + "loss": 0.3798, + "step": 2670 + }, + { + "epoch": 1.8, + "learning_rate": 1.7384798902247316e-05, + "loss": 0.3772, + "step": 2680 + }, + { + "epoch": 1.8, + "learning_rate": 1.7217857751664663e-05, + "loss": 0.3758, + "step": 2690 + }, + { + "epoch": 1.81, + "learning_rate": 1.7051299970064098e-05, + "loss": 0.3706, + "step": 2700 + }, + { + "epoch": 1.82, + "learning_rate": 1.6885133762523985e-05, + "loss": 0.3805, + "step": 2710 + }, + { + "epoch": 1.82, + "learning_rate": 1.6719367314832756e-05, + "loss": 0.3892, + "step": 2720 + }, + { + "epoch": 1.83, + "learning_rate": 1.65540087930856e-05, + "loss": 0.387, + "step": 2730 + }, + { + "epoch": 1.84, + "learning_rate": 1.6389066343282168e-05, + "loss": 0.3773, + "step": 2740 + }, + { + "epoch": 1.84, + "learning_rate": 1.6224548090925323e-05, + "loss": 0.3829, + "step": 2750 + }, + { + "epoch": 1.85, + "learning_rate": 1.6060462140620835e-05, + "loss": 0.3697, + "step": 2760 + }, + { + "epoch": 1.86, + "learning_rate": 1.589681657567811e-05, + "loss": 0.3817, + "step": 2770 + }, + { + "epoch": 1.86, + "learning_rate": 1.5733619457712037e-05, + "loss": 0.3819, + "step": 2780 + }, + { + "epoch": 1.87, + "learning_rate": 1.5570878826245773e-05, + "loss": 0.3756, + "step": 2790 + }, + { + "epoch": 1.88, + "learning_rate": 1.5408602698314777e-05, + "loss": 0.3791, + "step": 2800 + }, + { + "epoch": 1.88, + "learning_rate": 1.5246799068071818e-05, + "loss": 0.3765, + "step": 2810 + }, + { + "epoch": 1.89, + "learning_rate": 1.5085475906393153e-05, + "loss": 0.3834, + "step": 2820 + }, + { + "epoch": 1.9, + "learning_rate": 
1.4924641160485923e-05, + "loss": 0.381, + "step": 2830 + }, + { + "epoch": 1.9, + "learning_rate": 1.4764302753496584e-05, + "loss": 0.3766, + "step": 2840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4604468584120607e-05, + "loss": 0.3815, + "step": 2850 + }, + { + "epoch": 1.92, + "learning_rate": 1.4445146526213415e-05, + "loss": 0.3774, + "step": 2860 + }, + { + "epoch": 1.92, + "learning_rate": 1.4286344428402454e-05, + "loss": 0.3879, + "step": 2870 + }, + { + "epoch": 1.93, + "learning_rate": 1.412807011370052e-05, + "loss": 0.3777, + "step": 2880 + }, + { + "epoch": 1.94, + "learning_rate": 1.3970331379120455e-05, + "loss": 0.3806, + "step": 2890 + }, + { + "epoch": 1.94, + "learning_rate": 1.3813135995290988e-05, + "loss": 0.3848, + "step": 2900 + }, + { + "epoch": 1.95, + "learning_rate": 1.3656491706073935e-05, + "loss": 0.3745, + "step": 2910 + }, + { + "epoch": 1.96, + "learning_rate": 1.350040622818275e-05, + "loss": 0.377, + "step": 2920 + }, + { + "epoch": 1.96, + "learning_rate": 1.3344887250802345e-05, + "loss": 0.3783, + "step": 2930 + }, + { + "epoch": 1.97, + "learning_rate": 1.3189942435210301e-05, + "loss": 0.3768, + "step": 2940 + }, + { + "epoch": 1.98, + "learning_rate": 1.303557941439949e-05, + "loss": 0.3744, + "step": 2950 + }, + { + "epoch": 1.98, + "learning_rate": 1.2881805792702031e-05, + "loss": 0.3788, + "step": 2960 + }, + { + "epoch": 1.99, + "learning_rate": 1.2728629145414645e-05, + "loss": 0.3735, + "step": 2970 + }, + { + "epoch": 2.0, + "learning_rate": 1.257605701842554e-05, + "loss": 0.3819, + "step": 2980 + }, + { + "epoch": 2.0, + "learning_rate": 1.242409692784265e-05, + "loss": 0.3812, + "step": 2990 + }, + { + "epoch": 2.01, + "learning_rate": 1.2272756359623342e-05, + "loss": 0.3769, + "step": 3000 + }, + { + "epoch": 2.02, + "learning_rate": 1.2122042769205702e-05, + "loss": 0.3779, + "step": 3010 + }, + { + "epoch": 2.02, + "learning_rate": 1.1971963581141196e-05, + "loss": 0.3817, + "step": 3020 + }, + { + "epoch": 2.03, + "learning_rate": 1.1822526188728966e-05, + "loss": 0.3788, + "step": 3030 + }, + { + "epoch": 2.04, + "learning_rate": 1.1673737953651601e-05, + "loss": 0.3776, + "step": 3040 + }, + { + "epoch": 2.04, + "learning_rate": 1.1525606205612447e-05, + "loss": 0.3785, + "step": 3050 + }, + { + "epoch": 2.05, + "learning_rate": 1.1378138241974595e-05, + "loss": 0.3858, + "step": 3060 + }, + { + "epoch": 2.06, + "learning_rate": 1.1231341327401323e-05, + "loss": 0.3766, + "step": 3070 + }, + { + "epoch": 2.06, + "learning_rate": 1.1085222693498256e-05, + "loss": 0.3766, + "step": 3080 + }, + { + "epoch": 2.07, + "learning_rate": 1.093978953845713e-05, + "loss": 0.3795, + "step": 3090 + }, + { + "epoch": 2.08, + "learning_rate": 1.079504902670117e-05, + "loss": 0.3837, + "step": 3100 + }, + { + "epoch": 2.08, + "learning_rate": 1.065100828853213e-05, + "loss": 0.377, + "step": 3110 + }, + { + "epoch": 2.09, + "learning_rate": 1.0507674419779085e-05, + "loss": 0.3759, + "step": 3120 + }, + { + "epoch": 2.1, + "learning_rate": 1.0365054481448849e-05, + "loss": 0.3704, + "step": 3130 + }, + { + "epoch": 2.1, + "learning_rate": 1.02231554993781e-05, + "loss": 0.3751, + "step": 3140 + }, + { + "epoch": 2.11, + "learning_rate": 1.0081984463887325e-05, + "loss": 0.396, + "step": 3150 + }, + { + "epoch": 2.12, + "learning_rate": 9.941548329436425e-06, + "loss": 0.3788, + "step": 3160 + }, + { + "epoch": 2.12, + "learning_rate": 9.801854014282108e-06, + "loss": 0.3767, + "step": 3170 + }, + { + "epoch": 2.13, + "learning_rate": 
9.662908400137125e-06, + "loss": 0.3783, + "step": 3180 + }, + { + "epoch": 2.14, + "learning_rate": 9.524718331831186e-06, + "loss": 0.3775, + "step": 3190 + }, + { + "epoch": 2.14, + "learning_rate": 9.387290616973859e-06, + "loss": 0.3789, + "step": 3200 + }, + { + "epoch": 2.15, + "learning_rate": 9.250632025619104e-06, + "loss": 0.3776, + "step": 3210 + }, + { + "epoch": 2.16, + "learning_rate": 9.11474928993187e-06, + "loss": 0.368, + "step": 3220 + }, + { + "epoch": 2.16, + "learning_rate": 8.979649103856345e-06, + "loss": 0.378, + "step": 3230 + }, + { + "epoch": 2.17, + "learning_rate": 8.84533812278629e-06, + "loss": 0.3776, + "step": 3240 + }, + { + "epoch": 2.18, + "learning_rate": 8.711822963237093e-06, + "loss": 0.3731, + "step": 3250 + }, + { + "epoch": 2.18, + "learning_rate": 8.579110202519894e-06, + "loss": 0.3827, + "step": 3260 + }, + { + "epoch": 2.19, + "learning_rate": 8.447206378417533e-06, + "loss": 0.3725, + "step": 3270 + }, + { + "epoch": 2.2, + "learning_rate": 8.31611798886246e-06, + "loss": 0.372, + "step": 3280 + }, + { + "epoch": 2.2, + "learning_rate": 8.185851491616677e-06, + "loss": 0.3753, + "step": 3290 + }, + { + "epoch": 2.21, + "learning_rate": 8.0564133039536e-06, + "loss": 0.3732, + "step": 3300 + }, + { + "epoch": 2.22, + "learning_rate": 7.927809802341876e-06, + "loss": 0.37, + "step": 3310 + }, + { + "epoch": 2.22, + "learning_rate": 7.800047322131346e-06, + "loss": 0.372, + "step": 3320 + }, + { + "epoch": 2.23, + "learning_rate": 7.673132157240877e-06, + "loss": 0.3734, + "step": 3330 + }, + { + "epoch": 2.24, + "learning_rate": 7.5470705598483405e-06, + "loss": 0.3734, + "step": 3340 + }, + { + "epoch": 2.24, + "learning_rate": 7.4218687400826075e-06, + "loss": 0.3784, + "step": 3350 + }, + { + "epoch": 2.25, + "learning_rate": 7.297532865717638e-06, + "loss": 0.3715, + "step": 3360 + }, + { + "epoch": 2.26, + "learning_rate": 7.174069061868591e-06, + "loss": 0.3836, + "step": 3370 + }, + { + "epoch": 2.27, + "learning_rate": 7.05148341069014e-06, + "loss": 0.3784, + "step": 3380 + }, + { + "epoch": 2.27, + "learning_rate": 6.929781951076836e-06, + "loss": 0.3737, + "step": 3390 + }, + { + "epoch": 2.28, + "learning_rate": 6.80897067836557e-06, + "loss": 0.3827, + "step": 3400 + }, + { + "epoch": 2.29, + "learning_rate": 6.6890555440403015e-06, + "loss": 0.3808, + "step": 3410 + }, + { + "epoch": 2.29, + "learning_rate": 6.570042455438822e-06, + "loss": 0.3797, + "step": 3420 + }, + { + "epoch": 2.3, + "learning_rate": 6.451937275461736e-06, + "loss": 0.3739, + "step": 3430 + }, + { + "epoch": 2.31, + "learning_rate": 6.334745822283699e-06, + "loss": 0.3748, + "step": 3440 + }, + { + "epoch": 2.31, + "learning_rate": 6.2184738690667214e-06, + "loss": 0.3769, + "step": 3450 + }, + { + "epoch": 2.32, + "learning_rate": 6.103127143675832e-06, + "loss": 0.3756, + "step": 3460 + }, + { + "epoch": 2.33, + "learning_rate": 5.988711328396859e-06, + "loss": 0.3738, + "step": 3470 + }, + { + "epoch": 2.33, + "learning_rate": 5.875232059656552e-06, + "loss": 0.3676, + "step": 3480 + }, + { + "epoch": 2.34, + "learning_rate": 5.762694927744866e-06, + "loss": 0.3737, + "step": 3490 + }, + { + "epoch": 2.35, + "learning_rate": 5.651105476539623e-06, + "loss": 0.369, + "step": 3500 + }, + { + "epoch": 2.35, + "learning_rate": 5.540469203233347e-06, + "loss": 0.3723, + "step": 3510 + }, + { + "epoch": 2.36, + "learning_rate": 5.430791558062518e-06, + "loss": 0.3791, + "step": 3520 + }, + { + "epoch": 2.37, + "learning_rate": 5.322077944039039e-06, + "loss": 
0.3753, + "step": 3530 + }, + { + "epoch": 2.37, + "learning_rate": 5.21433371668407e-06, + "loss": 0.3703, + "step": 3540 + }, + { + "epoch": 2.38, + "learning_rate": 5.107564183764219e-06, + "loss": 0.3781, + "step": 3550 + }, + { + "epoch": 2.39, + "learning_rate": 5.001774605030074e-06, + "loss": 0.3766, + "step": 3560 + }, + { + "epoch": 2.39, + "learning_rate": 4.8969701919570454e-06, + "loss": 0.38, + "step": 3570 + }, + { + "epoch": 2.4, + "learning_rate": 4.7931561074887e-06, + "loss": 0.3681, + "step": 3580 + }, + { + "epoch": 2.41, + "learning_rate": 4.690337465782366e-06, + "loss": 0.3752, + "step": 3590 + }, + { + "epoch": 2.41, + "learning_rate": 4.588519331957241e-06, + "loss": 0.3775, + "step": 3600 + }, + { + "epoch": 2.42, + "learning_rate": 4.4877067218448285e-06, + "loss": 0.3677, + "step": 3610 + }, + { + "epoch": 2.43, + "learning_rate": 4.38790460174188e-06, + "loss": 0.3718, + "step": 3620 + }, + { + "epoch": 2.43, + "learning_rate": 4.289117888165708e-06, + "loss": 0.3671, + "step": 3630 + }, + { + "epoch": 2.44, + "learning_rate": 4.191351447612032e-06, + "loss": 0.3728, + "step": 3640 + }, + { + "epoch": 2.45, + "learning_rate": 4.094610096315199e-06, + "loss": 0.3769, + "step": 3650 + }, + { + "epoch": 2.45, + "learning_rate": 3.998898600010928e-06, + "loss": 0.3777, + "step": 3660 + }, + { + "epoch": 2.46, + "learning_rate": 3.904221673701566e-06, + "loss": 0.3817, + "step": 3670 + }, + { + "epoch": 2.47, + "learning_rate": 3.810583981423796e-06, + "loss": 0.383, + "step": 3680 + }, + { + "epoch": 2.47, + "learning_rate": 3.7179901360188533e-06, + "loss": 0.3719, + "step": 3690 + }, + { + "epoch": 2.48, + "learning_rate": 3.626444698905329e-06, + "loss": 0.3716, + "step": 3700 + }, + { + "epoch": 2.49, + "learning_rate": 3.5359521798544347e-06, + "loss": 0.3736, + "step": 3710 + }, + { + "epoch": 2.49, + "learning_rate": 3.4465170367678294e-06, + "loss": 0.3741, + "step": 3720 + }, + { + "epoch": 2.5, + "learning_rate": 3.3581436754580363e-06, + "loss": 0.3756, + "step": 3730 + }, + { + "epoch": 2.51, + "learning_rate": 3.270836449431397e-06, + "loss": 0.3777, + "step": 3740 + }, + { + "epoch": 2.51, + "learning_rate": 3.184599659673579e-06, + "loss": 0.3774, + "step": 3750 + }, + { + "epoch": 2.52, + "learning_rate": 3.0994375544377424e-06, + "loss": 0.3785, + "step": 3760 + }, + { + "epoch": 2.53, + "learning_rate": 3.0153543290352164e-06, + "loss": 0.3768, + "step": 3770 + }, + { + "epoch": 2.53, + "learning_rate": 2.932354125628853e-06, + "loss": 0.377, + "step": 3780 + }, + { + "epoch": 2.54, + "learning_rate": 2.8504410330289778e-06, + "loss": 0.3803, + "step": 3790 + }, + { + "epoch": 2.55, + "learning_rate": 2.769619086491923e-06, + "loss": 0.3706, + "step": 3800 + }, + { + "epoch": 2.55, + "learning_rate": 2.6898922675213016e-06, + "loss": 0.3712, + "step": 3810 + }, + { + "epoch": 2.56, + "learning_rate": 2.611264503671823e-06, + "loss": 0.369, + "step": 3820 + }, + { + "epoch": 2.57, + "learning_rate": 2.533739668355814e-06, + "loss": 0.3726, + "step": 3830 + }, + { + "epoch": 2.57, + "learning_rate": 2.45732158065243e-06, + "loss": 0.3746, + "step": 3840 + }, + { + "epoch": 2.58, + "learning_rate": 2.382014005119501e-06, + "loss": 0.3701, + "step": 3850 + }, + { + "epoch": 2.59, + "learning_rate": 2.3078206516080695e-06, + "loss": 0.38, + "step": 3860 + }, + { + "epoch": 2.59, + "learning_rate": 2.2347451750796474e-06, + "loss": 0.3725, + "step": 3870 + }, + { + "epoch": 2.6, + "learning_rate": 2.1627911754261653e-06, + "loss": 0.3825, + "step": 
3880 + }, + { + "epoch": 2.61, + "learning_rate": 2.0919621972926156e-06, + "loss": 0.3791, + "step": 3890 + }, + { + "epoch": 2.61, + "learning_rate": 2.022261729902458e-06, + "loss": 0.3778, + "step": 3900 + }, + { + "epoch": 2.62, + "learning_rate": 1.953693206885715e-06, + "loss": 0.3735, + "step": 3910 + }, + { + "epoch": 2.63, + "learning_rate": 1.8862600061098106e-06, + "loss": 0.3816, + "step": 3920 + }, + { + "epoch": 2.63, + "learning_rate": 1.8199654495131974e-06, + "loss": 0.3752, + "step": 3930 + }, + { + "epoch": 2.64, + "learning_rate": 1.754812802941691e-06, + "loss": 0.3739, + "step": 3940 + }, + { + "epoch": 2.65, + "learning_rate": 1.6908052759875836e-06, + "loss": 0.3745, + "step": 3950 + }, + { + "epoch": 2.65, + "learning_rate": 1.6279460218315361e-06, + "loss": 0.3753, + "step": 3960 + }, + { + "epoch": 2.66, + "learning_rate": 1.5662381370872532e-06, + "loss": 0.3736, + "step": 3970 + }, + { + "epoch": 2.67, + "learning_rate": 1.5056846616489124e-06, + "loss": 0.3755, + "step": 3980 + }, + { + "epoch": 2.67, + "learning_rate": 1.4462885785414327e-06, + "loss": 0.3741, + "step": 3990 + }, + { + "epoch": 2.68, + "learning_rate": 1.3880528137735132e-06, + "loss": 0.3708, + "step": 4000 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 3.6390093645043204e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-4200/README.md b/checkpoint-4200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-4200/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
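+
+A minimal sketch, assuming the base-model path recorded in `adapter_config.json` and a hypothetical local adapter directory (`./checkpoint-4200`); the prompt is likewise a placeholder:
+
+```python
+from transformers import AutoModel, AutoTokenizer
+from peft import PeftModel
+
+# base_model_name_or_path as recorded in adapter_config.json
+base = "/home/hz/projects/chatglm3-6b-32k"
+# hypothetical local path to this checkpoint's LoRA adapter
+adapter = "./checkpoint-4200"
+
+# ChatGLM ships custom tokenizer/model code, hence trust_remote_code=True
+tokenizer = AutoTokenizer.from_pretrained(base, trust_remote_code=True)
+model = AutoModel.from_pretrained(base, trust_remote_code=True).half().cuda()
+model = PeftModel.from_pretrained(model, adapter)  # wrap the base model with the LoRA weights
+model = model.eval()
+
+# ChatGLM's remote code exposes a chat() helper on the wrapped model
+response, history = model.chat(tokenizer, "Hello", history=[])
+print(response)
+```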
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-4200/adapter_config.json b/checkpoint-4200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-4200/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-4200/adapter_model.bin b/checkpoint-4200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3726dcf2125981fb135858f62e2fff38ec260030 --- /dev/null +++ b/checkpoint-4200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da965acea7b772dfe33a204e0366bfb18264f6b9591255cdb22efde6325afc08 +size 7820185 diff --git a/checkpoint-4200/added_tokens.json b/checkpoint-4200/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-4200/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-4200/optimizer.pt b/checkpoint-4200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..652ecce8803e1d4de705efa5aaef0b4e5e4cb64f 
--- /dev/null +++ b/checkpoint-4200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2b15412d698e5cf9e6711a778387681e5470d48efc973b2d47980f1685416b8 +size 15644485 diff --git a/checkpoint-4200/rng_state_0.pth b/checkpoint-4200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..91b2a04a393ad9e0c64f8fcda61dfa0af4fe438b --- /dev/null +++ b/checkpoint-4200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86b6fdb9039590d3e3a7cbeab8532f2f51861a38daed9a8dccb798a30bcc366a +size 21687 diff --git a/checkpoint-4200/rng_state_1.pth b/checkpoint-4200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f2bc3d3412035f18168878ab29583fb4f8296557 --- /dev/null +++ b/checkpoint-4200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c7b2c576853f8574e0dc4a43c353f3f32cb91d01be110a55669e7c7b8da8e7c +size 21687 diff --git a/checkpoint-4200/rng_state_2.pth b/checkpoint-4200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..87895c9258c28adf2e3ed9b6639908de162ba159 --- /dev/null +++ b/checkpoint-4200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee23b26d142df2942621bac8ffb26409a952ee6a1dc2d30918df2bfcdc82e5c1 +size 21687 diff --git a/checkpoint-4200/rng_state_3.pth b/checkpoint-4200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ca5509a203d3084d7ae6de6857d6caf22307787 --- /dev/null +++ b/checkpoint-4200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad7c45e809491c089f2753bfdc6b91ae043476550138923c03d40cf28f3c291c +size 21687 diff --git a/checkpoint-4200/rng_state_4.pth b/checkpoint-4200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..4da9159ca87f8fbb56e47e2d35521a829223ba9b --- /dev/null +++ b/checkpoint-4200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edb5fe33d221f1a35425b3ad6cd5405f1e693e194625ceb2a6ad8a7eabc2c031 +size 21687 diff --git a/checkpoint-4200/rng_state_5.pth b/checkpoint-4200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..ae473e4d6066402f277548afe272d4c2d73e328a --- /dev/null +++ b/checkpoint-4200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d1666eefd5c594a529b23b10132db0f9acd3daacfbb20d2ce6b462ab5293c96 +size 21687 diff --git a/checkpoint-4200/rng_state_6.pth b/checkpoint-4200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..95a907424aecebe50bb4dd6e1c636f29b7c75247 --- /dev/null +++ b/checkpoint-4200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6367c95b224fadf290e1e971e1421637697d6bcb68680165eb9e2d5df2d0dfd +size 21687 diff --git a/checkpoint-4200/rng_state_7.pth b/checkpoint-4200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..186f7fb9146015e414003a887d552642294a1829 --- /dev/null +++ b/checkpoint-4200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:014c54782c433ffff425ff6b6bc722e460783162d27666bed6299d980085898a +size 21687 diff --git a/checkpoint-4200/scheduler.pt b/checkpoint-4200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f1167f0b4d473e305a20d95cde911ef304dde49 --- /dev/null +++ b/checkpoint-4200/scheduler.pt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:378c7d713807e4a151c257b2e7de1f358a8192a2c2f54e827f895e3f73322344 +size 627 diff --git a/checkpoint-4200/special_tokens_map.json b/checkpoint-4200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-4200/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-4200/tokenization_chatglm.py b/checkpoint-4200/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-4200/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
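+        Special tokens registered by SPTokenizer are looked up in its own table first; any other piece falls back to SentencePiece's PieceToId.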
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-4200/tokenizer.model b/checkpoint-4200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-4200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-4200/tokenizer_config.json b/checkpoint-4200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-4200/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-4200/trainer_state.json b/checkpoint-4200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..81fec5ddf985c8b5a0d7c747434d23795989133b --- /dev/null +++ b/checkpoint-4200/trainer_state.json @@ -0,0 +1,2539 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.8145417992963644, + "eval_steps": 500, + "global_step": 4200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + }, + { + "epoch": 1.62, + "learning_rate": 2.2006682755823367e-05, + "loss": 0.3871, + "step": 2410 + }, + { + "epoch": 1.62, + "learning_rate": 2.1832551450078594e-05, + "loss": 0.3867, + "step": 2420 + }, + { + "epoch": 1.63, + "learning_rate": 2.165857618125625e-05, + "loss": 0.3878, + "step": 2430 + }, + { + "epoch": 1.64, + "learning_rate": 2.1484765519839843e-05, + "loss": 0.386, + "step": 2440 + }, + { + "epoch": 1.64, + "learning_rate": 2.1311128028203863e-05, + "loss": 0.3836, + "step": 2450 + }, + { + "epoch": 1.65, + "learning_rate": 2.1137672260192004e-05, + "loss": 0.3849, + "step": 2460 + }, + { + "epoch": 1.66, + "learning_rate": 2.09644067606958e-05, + "loss": 0.3841, + "step": 2470 + }, + { + "epoch": 1.66, + "learning_rate": 
2.079134006523359e-05, + "loss": 0.3848, + "step": 2480 + }, + { + "epoch": 1.67, + "learning_rate": 2.061848069953017e-05, + "loss": 0.3817, + "step": 2490 + }, + { + "epoch": 1.68, + "learning_rate": 2.044583717909667e-05, + "loss": 0.3839, + "step": 2500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0273418008811125e-05, + "loss": 0.3749, + "step": 2510 + }, + { + "epoch": 1.69, + "learning_rate": 2.0101231682499506e-05, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 1.7, + "learning_rate": 1.9929286682517235e-05, + "loss": 0.3854, + "step": 2530 + }, + { + "epoch": 1.7, + "learning_rate": 1.9757591479331382e-05, + "loss": 0.3894, + "step": 2540 + }, + { + "epoch": 1.71, + "learning_rate": 1.9586154531103373e-05, + "loss": 0.3871, + "step": 2550 + }, + { + "epoch": 1.72, + "learning_rate": 1.9414984283272286e-05, + "loss": 0.382, + "step": 2560 + }, + { + "epoch": 1.72, + "learning_rate": 1.9244089168138836e-05, + "loss": 0.3858, + "step": 2570 + }, + { + "epoch": 1.73, + "learning_rate": 1.9073477604449985e-05, + "loss": 0.3819, + "step": 2580 + }, + { + "epoch": 1.74, + "learning_rate": 1.8903157996984174e-05, + "loss": 0.3795, + "step": 2590 + }, + { + "epoch": 1.74, + "learning_rate": 1.873313873613733e-05, + "loss": 0.3827, + "step": 2600 + }, + { + "epoch": 1.75, + "learning_rate": 1.8563428197509502e-05, + "loss": 0.385, + "step": 2610 + }, + { + "epoch": 1.76, + "learning_rate": 1.839403474149225e-05, + "loss": 0.3846, + "step": 2620 + }, + { + "epoch": 1.76, + "learning_rate": 1.8224966712856806e-05, + "loss": 0.3828, + "step": 2630 + }, + { + "epoch": 1.77, + "learning_rate": 1.8056232440343013e-05, + "loss": 0.3792, + "step": 2640 + }, + { + "epoch": 1.78, + "learning_rate": 1.788784023624896e-05, + "loss": 0.3814, + "step": 2650 + }, + { + "epoch": 1.78, + "learning_rate": 1.7719798396021558e-05, + "loss": 0.3819, + "step": 2660 + }, + { + "epoch": 1.79, + "learning_rate": 1.7552115197847884e-05, + "loss": 0.3798, + "step": 2670 + }, + { + "epoch": 1.8, + "learning_rate": 1.7384798902247316e-05, + "loss": 0.3772, + "step": 2680 + }, + { + "epoch": 1.8, + "learning_rate": 1.7217857751664663e-05, + "loss": 0.3758, + "step": 2690 + }, + { + "epoch": 1.81, + "learning_rate": 1.7051299970064098e-05, + "loss": 0.3706, + "step": 2700 + }, + { + "epoch": 1.82, + "learning_rate": 1.6885133762523985e-05, + "loss": 0.3805, + "step": 2710 + }, + { + "epoch": 1.82, + "learning_rate": 1.6719367314832756e-05, + "loss": 0.3892, + "step": 2720 + }, + { + "epoch": 1.83, + "learning_rate": 1.65540087930856e-05, + "loss": 0.387, + "step": 2730 + }, + { + "epoch": 1.84, + "learning_rate": 1.6389066343282168e-05, + "loss": 0.3773, + "step": 2740 + }, + { + "epoch": 1.84, + "learning_rate": 1.6224548090925323e-05, + "loss": 0.3829, + "step": 2750 + }, + { + "epoch": 1.85, + "learning_rate": 1.6060462140620835e-05, + "loss": 0.3697, + "step": 2760 + }, + { + "epoch": 1.86, + "learning_rate": 1.589681657567811e-05, + "loss": 0.3817, + "step": 2770 + }, + { + "epoch": 1.86, + "learning_rate": 1.5733619457712037e-05, + "loss": 0.3819, + "step": 2780 + }, + { + "epoch": 1.87, + "learning_rate": 1.5570878826245773e-05, + "loss": 0.3756, + "step": 2790 + }, + { + "epoch": 1.88, + "learning_rate": 1.5408602698314777e-05, + "loss": 0.3791, + "step": 2800 + }, + { + "epoch": 1.88, + "learning_rate": 1.5246799068071818e-05, + "loss": 0.3765, + "step": 2810 + }, + { + "epoch": 1.89, + "learning_rate": 1.5085475906393153e-05, + "loss": 0.3834, + "step": 2820 + }, + { + "epoch": 1.9, + "learning_rate": 
1.4924641160485923e-05, + "loss": 0.381, + "step": 2830 + }, + { + "epoch": 1.9, + "learning_rate": 1.4764302753496584e-05, + "loss": 0.3766, + "step": 2840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4604468584120607e-05, + "loss": 0.3815, + "step": 2850 + }, + { + "epoch": 1.92, + "learning_rate": 1.4445146526213415e-05, + "loss": 0.3774, + "step": 2860 + }, + { + "epoch": 1.92, + "learning_rate": 1.4286344428402454e-05, + "loss": 0.3879, + "step": 2870 + }, + { + "epoch": 1.93, + "learning_rate": 1.412807011370052e-05, + "loss": 0.3777, + "step": 2880 + }, + { + "epoch": 1.94, + "learning_rate": 1.3970331379120455e-05, + "loss": 0.3806, + "step": 2890 + }, + { + "epoch": 1.94, + "learning_rate": 1.3813135995290988e-05, + "loss": 0.3848, + "step": 2900 + }, + { + "epoch": 1.95, + "learning_rate": 1.3656491706073935e-05, + "loss": 0.3745, + "step": 2910 + }, + { + "epoch": 1.96, + "learning_rate": 1.350040622818275e-05, + "loss": 0.377, + "step": 2920 + }, + { + "epoch": 1.96, + "learning_rate": 1.3344887250802345e-05, + "loss": 0.3783, + "step": 2930 + }, + { + "epoch": 1.97, + "learning_rate": 1.3189942435210301e-05, + "loss": 0.3768, + "step": 2940 + }, + { + "epoch": 1.98, + "learning_rate": 1.303557941439949e-05, + "loss": 0.3744, + "step": 2950 + }, + { + "epoch": 1.98, + "learning_rate": 1.2881805792702031e-05, + "loss": 0.3788, + "step": 2960 + }, + { + "epoch": 1.99, + "learning_rate": 1.2728629145414645e-05, + "loss": 0.3735, + "step": 2970 + }, + { + "epoch": 2.0, + "learning_rate": 1.257605701842554e-05, + "loss": 0.3819, + "step": 2980 + }, + { + "epoch": 2.0, + "learning_rate": 1.242409692784265e-05, + "loss": 0.3812, + "step": 2990 + }, + { + "epoch": 2.01, + "learning_rate": 1.2272756359623342e-05, + "loss": 0.3769, + "step": 3000 + }, + { + "epoch": 2.02, + "learning_rate": 1.2122042769205702e-05, + "loss": 0.3779, + "step": 3010 + }, + { + "epoch": 2.02, + "learning_rate": 1.1971963581141196e-05, + "loss": 0.3817, + "step": 3020 + }, + { + "epoch": 2.03, + "learning_rate": 1.1822526188728966e-05, + "loss": 0.3788, + "step": 3030 + }, + { + "epoch": 2.04, + "learning_rate": 1.1673737953651601e-05, + "loss": 0.3776, + "step": 3040 + }, + { + "epoch": 2.04, + "learning_rate": 1.1525606205612447e-05, + "loss": 0.3785, + "step": 3050 + }, + { + "epoch": 2.05, + "learning_rate": 1.1378138241974595e-05, + "loss": 0.3858, + "step": 3060 + }, + { + "epoch": 2.06, + "learning_rate": 1.1231341327401323e-05, + "loss": 0.3766, + "step": 3070 + }, + { + "epoch": 2.06, + "learning_rate": 1.1085222693498256e-05, + "loss": 0.3766, + "step": 3080 + }, + { + "epoch": 2.07, + "learning_rate": 1.093978953845713e-05, + "loss": 0.3795, + "step": 3090 + }, + { + "epoch": 2.08, + "learning_rate": 1.079504902670117e-05, + "loss": 0.3837, + "step": 3100 + }, + { + "epoch": 2.08, + "learning_rate": 1.065100828853213e-05, + "loss": 0.377, + "step": 3110 + }, + { + "epoch": 2.09, + "learning_rate": 1.0507674419779085e-05, + "loss": 0.3759, + "step": 3120 + }, + { + "epoch": 2.1, + "learning_rate": 1.0365054481448849e-05, + "loss": 0.3704, + "step": 3130 + }, + { + "epoch": 2.1, + "learning_rate": 1.02231554993781e-05, + "loss": 0.3751, + "step": 3140 + }, + { + "epoch": 2.11, + "learning_rate": 1.0081984463887325e-05, + "loss": 0.396, + "step": 3150 + }, + { + "epoch": 2.12, + "learning_rate": 9.941548329436425e-06, + "loss": 0.3788, + "step": 3160 + }, + { + "epoch": 2.12, + "learning_rate": 9.801854014282108e-06, + "loss": 0.3767, + "step": 3170 + }, + { + "epoch": 2.13, + "learning_rate": 
9.662908400137125e-06, + "loss": 0.3783, + "step": 3180 + }, + { + "epoch": 2.14, + "learning_rate": 9.524718331831186e-06, + "loss": 0.3775, + "step": 3190 + }, + { + "epoch": 2.14, + "learning_rate": 9.387290616973859e-06, + "loss": 0.3789, + "step": 3200 + }, + { + "epoch": 2.15, + "learning_rate": 9.250632025619104e-06, + "loss": 0.3776, + "step": 3210 + }, + { + "epoch": 2.16, + "learning_rate": 9.11474928993187e-06, + "loss": 0.368, + "step": 3220 + }, + { + "epoch": 2.16, + "learning_rate": 8.979649103856345e-06, + "loss": 0.378, + "step": 3230 + }, + { + "epoch": 2.17, + "learning_rate": 8.84533812278629e-06, + "loss": 0.3776, + "step": 3240 + }, + { + "epoch": 2.18, + "learning_rate": 8.711822963237093e-06, + "loss": 0.3731, + "step": 3250 + }, + { + "epoch": 2.18, + "learning_rate": 8.579110202519894e-06, + "loss": 0.3827, + "step": 3260 + }, + { + "epoch": 2.19, + "learning_rate": 8.447206378417533e-06, + "loss": 0.3725, + "step": 3270 + }, + { + "epoch": 2.2, + "learning_rate": 8.31611798886246e-06, + "loss": 0.372, + "step": 3280 + }, + { + "epoch": 2.2, + "learning_rate": 8.185851491616677e-06, + "loss": 0.3753, + "step": 3290 + }, + { + "epoch": 2.21, + "learning_rate": 8.0564133039536e-06, + "loss": 0.3732, + "step": 3300 + }, + { + "epoch": 2.22, + "learning_rate": 7.927809802341876e-06, + "loss": 0.37, + "step": 3310 + }, + { + "epoch": 2.22, + "learning_rate": 7.800047322131346e-06, + "loss": 0.372, + "step": 3320 + }, + { + "epoch": 2.23, + "learning_rate": 7.673132157240877e-06, + "loss": 0.3734, + "step": 3330 + }, + { + "epoch": 2.24, + "learning_rate": 7.5470705598483405e-06, + "loss": 0.3734, + "step": 3340 + }, + { + "epoch": 2.24, + "learning_rate": 7.4218687400826075e-06, + "loss": 0.3784, + "step": 3350 + }, + { + "epoch": 2.25, + "learning_rate": 7.297532865717638e-06, + "loss": 0.3715, + "step": 3360 + }, + { + "epoch": 2.26, + "learning_rate": 7.174069061868591e-06, + "loss": 0.3836, + "step": 3370 + }, + { + "epoch": 2.27, + "learning_rate": 7.05148341069014e-06, + "loss": 0.3784, + "step": 3380 + }, + { + "epoch": 2.27, + "learning_rate": 6.929781951076836e-06, + "loss": 0.3737, + "step": 3390 + }, + { + "epoch": 2.28, + "learning_rate": 6.80897067836557e-06, + "loss": 0.3827, + "step": 3400 + }, + { + "epoch": 2.29, + "learning_rate": 6.6890555440403015e-06, + "loss": 0.3808, + "step": 3410 + }, + { + "epoch": 2.29, + "learning_rate": 6.570042455438822e-06, + "loss": 0.3797, + "step": 3420 + }, + { + "epoch": 2.3, + "learning_rate": 6.451937275461736e-06, + "loss": 0.3739, + "step": 3430 + }, + { + "epoch": 2.31, + "learning_rate": 6.334745822283699e-06, + "loss": 0.3748, + "step": 3440 + }, + { + "epoch": 2.31, + "learning_rate": 6.2184738690667214e-06, + "loss": 0.3769, + "step": 3450 + }, + { + "epoch": 2.32, + "learning_rate": 6.103127143675832e-06, + "loss": 0.3756, + "step": 3460 + }, + { + "epoch": 2.33, + "learning_rate": 5.988711328396859e-06, + "loss": 0.3738, + "step": 3470 + }, + { + "epoch": 2.33, + "learning_rate": 5.875232059656552e-06, + "loss": 0.3676, + "step": 3480 + }, + { + "epoch": 2.34, + "learning_rate": 5.762694927744866e-06, + "loss": 0.3737, + "step": 3490 + }, + { + "epoch": 2.35, + "learning_rate": 5.651105476539623e-06, + "loss": 0.369, + "step": 3500 + }, + { + "epoch": 2.35, + "learning_rate": 5.540469203233347e-06, + "loss": 0.3723, + "step": 3510 + }, + { + "epoch": 2.36, + "learning_rate": 5.430791558062518e-06, + "loss": 0.3791, + "step": 3520 + }, + { + "epoch": 2.37, + "learning_rate": 5.322077944039039e-06, + "loss": 
0.3753, + "step": 3530 + }, + { + "epoch": 2.37, + "learning_rate": 5.21433371668407e-06, + "loss": 0.3703, + "step": 3540 + }, + { + "epoch": 2.38, + "learning_rate": 5.107564183764219e-06, + "loss": 0.3781, + "step": 3550 + }, + { + "epoch": 2.39, + "learning_rate": 5.001774605030074e-06, + "loss": 0.3766, + "step": 3560 + }, + { + "epoch": 2.39, + "learning_rate": 4.8969701919570454e-06, + "loss": 0.38, + "step": 3570 + }, + { + "epoch": 2.4, + "learning_rate": 4.7931561074887e-06, + "loss": 0.3681, + "step": 3580 + }, + { + "epoch": 2.41, + "learning_rate": 4.690337465782366e-06, + "loss": 0.3752, + "step": 3590 + }, + { + "epoch": 2.41, + "learning_rate": 4.588519331957241e-06, + "loss": 0.3775, + "step": 3600 + }, + { + "epoch": 2.42, + "learning_rate": 4.4877067218448285e-06, + "loss": 0.3677, + "step": 3610 + }, + { + "epoch": 2.43, + "learning_rate": 4.38790460174188e-06, + "loss": 0.3718, + "step": 3620 + }, + { + "epoch": 2.43, + "learning_rate": 4.289117888165708e-06, + "loss": 0.3671, + "step": 3630 + }, + { + "epoch": 2.44, + "learning_rate": 4.191351447612032e-06, + "loss": 0.3728, + "step": 3640 + }, + { + "epoch": 2.45, + "learning_rate": 4.094610096315199e-06, + "loss": 0.3769, + "step": 3650 + }, + { + "epoch": 2.45, + "learning_rate": 3.998898600010928e-06, + "loss": 0.3777, + "step": 3660 + }, + { + "epoch": 2.46, + "learning_rate": 3.904221673701566e-06, + "loss": 0.3817, + "step": 3670 + }, + { + "epoch": 2.47, + "learning_rate": 3.810583981423796e-06, + "loss": 0.383, + "step": 3680 + }, + { + "epoch": 2.47, + "learning_rate": 3.7179901360188533e-06, + "loss": 0.3719, + "step": 3690 + }, + { + "epoch": 2.48, + "learning_rate": 3.626444698905329e-06, + "loss": 0.3716, + "step": 3700 + }, + { + "epoch": 2.49, + "learning_rate": 3.5359521798544347e-06, + "loss": 0.3736, + "step": 3710 + }, + { + "epoch": 2.49, + "learning_rate": 3.4465170367678294e-06, + "loss": 0.3741, + "step": 3720 + }, + { + "epoch": 2.5, + "learning_rate": 3.3581436754580363e-06, + "loss": 0.3756, + "step": 3730 + }, + { + "epoch": 2.51, + "learning_rate": 3.270836449431397e-06, + "loss": 0.3777, + "step": 3740 + }, + { + "epoch": 2.51, + "learning_rate": 3.184599659673579e-06, + "loss": 0.3774, + "step": 3750 + }, + { + "epoch": 2.52, + "learning_rate": 3.0994375544377424e-06, + "loss": 0.3785, + "step": 3760 + }, + { + "epoch": 2.53, + "learning_rate": 3.0153543290352164e-06, + "loss": 0.3768, + "step": 3770 + }, + { + "epoch": 2.53, + "learning_rate": 2.932354125628853e-06, + "loss": 0.377, + "step": 3780 + }, + { + "epoch": 2.54, + "learning_rate": 2.8504410330289778e-06, + "loss": 0.3803, + "step": 3790 + }, + { + "epoch": 2.55, + "learning_rate": 2.769619086491923e-06, + "loss": 0.3706, + "step": 3800 + }, + { + "epoch": 2.55, + "learning_rate": 2.6898922675213016e-06, + "loss": 0.3712, + "step": 3810 + }, + { + "epoch": 2.56, + "learning_rate": 2.611264503671823e-06, + "loss": 0.369, + "step": 3820 + }, + { + "epoch": 2.57, + "learning_rate": 2.533739668355814e-06, + "loss": 0.3726, + "step": 3830 + }, + { + "epoch": 2.57, + "learning_rate": 2.45732158065243e-06, + "loss": 0.3746, + "step": 3840 + }, + { + "epoch": 2.58, + "learning_rate": 2.382014005119501e-06, + "loss": 0.3701, + "step": 3850 + }, + { + "epoch": 2.59, + "learning_rate": 2.3078206516080695e-06, + "loss": 0.38, + "step": 3860 + }, + { + "epoch": 2.59, + "learning_rate": 2.2347451750796474e-06, + "loss": 0.3725, + "step": 3870 + }, + { + "epoch": 2.6, + "learning_rate": 2.1627911754261653e-06, + "loss": 0.3825, + "step": 
3880 + }, + { + "epoch": 2.61, + "learning_rate": 2.0919621972926156e-06, + "loss": 0.3791, + "step": 3890 + }, + { + "epoch": 2.61, + "learning_rate": 2.022261729902458e-06, + "loss": 0.3778, + "step": 3900 + }, + { + "epoch": 2.62, + "learning_rate": 1.953693206885715e-06, + "loss": 0.3735, + "step": 3910 + }, + { + "epoch": 2.63, + "learning_rate": 1.8862600061098106e-06, + "loss": 0.3816, + "step": 3920 + }, + { + "epoch": 2.63, + "learning_rate": 1.8199654495131974e-06, + "loss": 0.3752, + "step": 3930 + }, + { + "epoch": 2.64, + "learning_rate": 1.754812802941691e-06, + "loss": 0.3739, + "step": 3940 + }, + { + "epoch": 2.65, + "learning_rate": 1.6908052759875836e-06, + "loss": 0.3745, + "step": 3950 + }, + { + "epoch": 2.65, + "learning_rate": 1.6279460218315361e-06, + "loss": 0.3753, + "step": 3960 + }, + { + "epoch": 2.66, + "learning_rate": 1.5662381370872532e-06, + "loss": 0.3736, + "step": 3970 + }, + { + "epoch": 2.67, + "learning_rate": 1.5056846616489124e-06, + "loss": 0.3755, + "step": 3980 + }, + { + "epoch": 2.67, + "learning_rate": 1.4462885785414327e-06, + "loss": 0.3741, + "step": 3990 + }, + { + "epoch": 2.68, + "learning_rate": 1.3880528137735132e-06, + "loss": 0.3708, + "step": 4000 + }, + { + "epoch": 2.69, + "learning_rate": 1.3309802361934936e-06, + "loss": 0.3703, + "step": 4010 + }, + { + "epoch": 2.69, + "learning_rate": 1.2750736573480248e-06, + "loss": 0.3784, + "step": 4020 + }, + { + "epoch": 2.7, + "learning_rate": 1.2203358313435609e-06, + "loss": 0.3785, + "step": 4030 + }, + { + "epoch": 2.71, + "learning_rate": 1.1667694547106978e-06, + "loss": 0.3832, + "step": 4040 + }, + { + "epoch": 2.71, + "learning_rate": 1.1143771662713214e-06, + "loss": 0.3708, + "step": 4050 + }, + { + "epoch": 2.72, + "learning_rate": 1.063161547008612e-06, + "loss": 0.3777, + "step": 4060 + }, + { + "epoch": 2.73, + "learning_rate": 1.0131251199399089e-06, + "loss": 0.375, + "step": 4070 + }, + { + "epoch": 2.73, + "learning_rate": 9.642703499924216e-07, + "loss": 0.3719, + "step": 4080 + }, + { + "epoch": 2.74, + "learning_rate": 9.16599643881777e-07, + "loss": 0.3776, + "step": 4090 + }, + { + "epoch": 2.75, + "learning_rate": 8.701153499934833e-07, + "loss": 0.377, + "step": 4100 + }, + { + "epoch": 2.75, + "learning_rate": 8.248197582672395e-07, + "loss": 0.3759, + "step": 4110 + }, + { + "epoch": 2.76, + "learning_rate": 7.807151000841118e-07, + "loss": 0.3727, + "step": 4120 + }, + { + "epoch": 2.77, + "learning_rate": 7.378035481566181e-07, + "loss": 0.374, + "step": 4130 + }, + { + "epoch": 2.77, + "learning_rate": 6.960872164217064e-07, + "loss": 0.3792, + "step": 4140 + }, + { + "epoch": 2.78, + "learning_rate": 6.555681599365926e-07, + "loss": 0.3692, + "step": 4150 + }, + { + "epoch": 2.79, + "learning_rate": 6.16248374777545e-07, + "loss": 0.3736, + "step": 4160 + }, + { + "epoch": 2.79, + "learning_rate": 5.781297979415456e-07, + "loss": 0.3695, + "step": 4170 + }, + { + "epoch": 2.8, + "learning_rate": 5.412143072508563e-07, + "loss": 0.3716, + "step": 4180 + }, + { + "epoch": 2.81, + "learning_rate": 5.055037212605279e-07, + "loss": 0.364, + "step": 4190 + }, + { + "epoch": 2.81, + "learning_rate": 4.709997991688114e-07, + "loss": 0.3707, + "step": 4200 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 3.820937932562445e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4200/training_args.bin b/checkpoint-4200/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-4200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-4400/README.md b/checkpoint-4400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-4400/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-4400/adapter_config.json b/checkpoint-4400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-4400/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-4400/adapter_model.bin b/checkpoint-4400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..dfb87d7a88e34f480123f8a09e243313d9b906e4 --- /dev/null +++ b/checkpoint-4400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dce18afede9cf4775720524b5e0433b70fbd1f9466b41e4805a64047fc11536 +size 7820185 diff --git a/checkpoint-4400/added_tokens.json b/checkpoint-4400/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-4400/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-4400/optimizer.pt b/checkpoint-4400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..af59cd2551fdd5764965dcba58e8fefe807c6742 --- /dev/null +++ b/checkpoint-4400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dcedaa6cd961f03985f6bd141125791c4062704fd283ee1d880f0da35f6864a +size 15644485 diff --git a/checkpoint-4400/rng_state_0.pth b/checkpoint-4400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a9233b02635872b32c7a9d386e5e5d50c499678 --- /dev/null +++ b/checkpoint-4400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee2cdd62dc064cc2dfcac5a53a32eb57d0945a749502dce43cbdc8ada36edd0e +size 21687 diff --git a/checkpoint-4400/rng_state_1.pth b/checkpoint-4400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9199a69fb2443dec4342f3be310e283370f4137f --- /dev/null +++ b/checkpoint-4400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:18120ac639c60aff6620e284b9ac22ff9034ad44feef1d56e1c81f9a5243aa89 +size 21687 diff --git a/checkpoint-4400/rng_state_2.pth b/checkpoint-4400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..77f37d8f3166d01f44bd5c72ad91396be6f7a43c --- /dev/null +++ b/checkpoint-4400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22011ae0435f8dc0c9bc52603a5adacc0d3626bf5a98493f2d45360d29f1bbe2 +size 21687 diff --git a/checkpoint-4400/rng_state_3.pth b/checkpoint-4400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..83fa30c71401954c46625a70585e0bc474aa7c8b --- /dev/null +++ b/checkpoint-4400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:634c7955a5f7111e59c5ec652dd7a6eef06b07e758e3e0f92a77be1c65850cc0 +size 21687 diff --git a/checkpoint-4400/rng_state_4.pth b/checkpoint-4400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..321944f4437974edd0bbc3bbde4bb5fda9366745 --- /dev/null +++ b/checkpoint-4400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbcd6b017c2373354ddd24e682c090ee8dcdbe9d54e2977853c665f0a933644c +size 21687 diff --git a/checkpoint-4400/rng_state_5.pth b/checkpoint-4400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f778d72608dd0653475c273a93893aee0bdad91 --- /dev/null +++ b/checkpoint-4400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee9cd6b29f3e7dc63adf9cd2ae703992dfac97538e7355649d6682f3d142079 +size 21687 diff --git a/checkpoint-4400/rng_state_6.pth b/checkpoint-4400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..14518e1b9719a1a9118da2e39b478c1f36dd3a3e --- /dev/null +++ b/checkpoint-4400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43306a7d247ee4d5447a35be1bcdd081691b66d32626b6d740ac3462a74bdce +size 21687 diff --git a/checkpoint-4400/rng_state_7.pth b/checkpoint-4400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..661b6aef4b0873a6e76d9b6fc0385b902403a290 --- /dev/null +++ b/checkpoint-4400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4587296a400670eba25273d4a94764d4d98e72e5fc1607150d1c31330147087d +size 21687 diff --git a/checkpoint-4400/scheduler.pt b/checkpoint-4400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9a77913bb56f43e51616b8f53e829e64e66eb72 --- /dev/null +++ b/checkpoint-4400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5ee8503bae553820696aa66a6f4b887d0c0c463f6626c391515cf8a3c696513 +size 627 diff --git a/checkpoint-4400/special_tokens_map.json b/checkpoint-4400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-4400/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-4400/tokenization_chatglm.py b/checkpoint-4400/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-4400/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece 
import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # load the SentencePiece model from disk + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs; ChatGLM has no dedicated pad id, so the unk id is reused + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + # role/control tokens are appended after the SentencePiece vocabulary + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + # buffer plain SentencePiece ids and flush them through sp_model.decode() + # whenever a special-token id interrupts the run, so special tokens are + # rendered as literal text instead of being dropped + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) to an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token)
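+ # Usage sketch (illustrative only; the "tokenizer.model" path is an assumption): + # sp = SPTokenizer("tokenizer.model") + # ids = sp.encode("hello", eos=True) # plain SentencePiece ids + [eos_id] + # text = sp.decode(ids) # round-trips; special ids render as literal text + # uid = sp.special_tokens["<|user|>"] # role-control id appended after the base vocab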
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.0` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # ChatGLM only supports left padding; fail fast on any other configuration. + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + # Initialize position ids if not present. + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + # Left-pad input ids, attention mask and position ids up to max_length. + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-4400/tokenizer.model b/checkpoint-4400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-4400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-4400/tokenizer_config.json b/checkpoint-4400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-4400/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-4400/trainer_state.json b/checkpoint-4400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d8f54a64a43ca96ef30ab3b6e030291ec57e0bbb --- /dev/null +++ b/checkpoint-4400/trainer_state.json @@ -0,0 +1,2659 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.948567599262858, + "eval_steps": 500, + "global_step": 4400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step":
10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 
4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 
720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + "epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + 
"learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + "learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 
3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 
3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 
2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + }, + { + "epoch": 1.62, + "learning_rate": 2.2006682755823367e-05, + "loss": 0.3871, + "step": 2410 + }, + { + "epoch": 1.62, + "learning_rate": 2.1832551450078594e-05, + "loss": 0.3867, + "step": 2420 + }, + { + "epoch": 1.63, + "learning_rate": 2.165857618125625e-05, + "loss": 0.3878, + "step": 2430 + }, + { + "epoch": 1.64, + "learning_rate": 2.1484765519839843e-05, + "loss": 0.386, + "step": 2440 + }, + { + "epoch": 1.64, + "learning_rate": 2.1311128028203863e-05, + "loss": 0.3836, + "step": 2450 + }, + { + "epoch": 1.65, + "learning_rate": 2.1137672260192004e-05, + "loss": 0.3849, + "step": 2460 + }, + { + "epoch": 1.66, + "learning_rate": 2.09644067606958e-05, + "loss": 0.3841, + "step": 2470 + }, + { + "epoch": 1.66, + "learning_rate": 
2.079134006523359e-05, + "loss": 0.3848, + "step": 2480 + }, + { + "epoch": 1.67, + "learning_rate": 2.061848069953017e-05, + "loss": 0.3817, + "step": 2490 + }, + { + "epoch": 1.68, + "learning_rate": 2.044583717909667e-05, + "loss": 0.3839, + "step": 2500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0273418008811125e-05, + "loss": 0.3749, + "step": 2510 + }, + { + "epoch": 1.69, + "learning_rate": 2.0101231682499506e-05, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 1.7, + "learning_rate": 1.9929286682517235e-05, + "loss": 0.3854, + "step": 2530 + }, + { + "epoch": 1.7, + "learning_rate": 1.9757591479331382e-05, + "loss": 0.3894, + "step": 2540 + }, + { + "epoch": 1.71, + "learning_rate": 1.9586154531103373e-05, + "loss": 0.3871, + "step": 2550 + }, + { + "epoch": 1.72, + "learning_rate": 1.9414984283272286e-05, + "loss": 0.382, + "step": 2560 + }, + { + "epoch": 1.72, + "learning_rate": 1.9244089168138836e-05, + "loss": 0.3858, + "step": 2570 + }, + { + "epoch": 1.73, + "learning_rate": 1.9073477604449985e-05, + "loss": 0.3819, + "step": 2580 + }, + { + "epoch": 1.74, + "learning_rate": 1.8903157996984174e-05, + "loss": 0.3795, + "step": 2590 + }, + { + "epoch": 1.74, + "learning_rate": 1.873313873613733e-05, + "loss": 0.3827, + "step": 2600 + }, + { + "epoch": 1.75, + "learning_rate": 1.8563428197509502e-05, + "loss": 0.385, + "step": 2610 + }, + { + "epoch": 1.76, + "learning_rate": 1.839403474149225e-05, + "loss": 0.3846, + "step": 2620 + }, + { + "epoch": 1.76, + "learning_rate": 1.8224966712856806e-05, + "loss": 0.3828, + "step": 2630 + }, + { + "epoch": 1.77, + "learning_rate": 1.8056232440343013e-05, + "loss": 0.3792, + "step": 2640 + }, + { + "epoch": 1.78, + "learning_rate": 1.788784023624896e-05, + "loss": 0.3814, + "step": 2650 + }, + { + "epoch": 1.78, + "learning_rate": 1.7719798396021558e-05, + "loss": 0.3819, + "step": 2660 + }, + { + "epoch": 1.79, + "learning_rate": 1.7552115197847884e-05, + "loss": 0.3798, + "step": 2670 + }, + { + "epoch": 1.8, + "learning_rate": 1.7384798902247316e-05, + "loss": 0.3772, + "step": 2680 + }, + { + "epoch": 1.8, + "learning_rate": 1.7217857751664663e-05, + "loss": 0.3758, + "step": 2690 + }, + { + "epoch": 1.81, + "learning_rate": 1.7051299970064098e-05, + "loss": 0.3706, + "step": 2700 + }, + { + "epoch": 1.82, + "learning_rate": 1.6885133762523985e-05, + "loss": 0.3805, + "step": 2710 + }, + { + "epoch": 1.82, + "learning_rate": 1.6719367314832756e-05, + "loss": 0.3892, + "step": 2720 + }, + { + "epoch": 1.83, + "learning_rate": 1.65540087930856e-05, + "loss": 0.387, + "step": 2730 + }, + { + "epoch": 1.84, + "learning_rate": 1.6389066343282168e-05, + "loss": 0.3773, + "step": 2740 + }, + { + "epoch": 1.84, + "learning_rate": 1.6224548090925323e-05, + "loss": 0.3829, + "step": 2750 + }, + { + "epoch": 1.85, + "learning_rate": 1.6060462140620835e-05, + "loss": 0.3697, + "step": 2760 + }, + { + "epoch": 1.86, + "learning_rate": 1.589681657567811e-05, + "loss": 0.3817, + "step": 2770 + }, + { + "epoch": 1.86, + "learning_rate": 1.5733619457712037e-05, + "loss": 0.3819, + "step": 2780 + }, + { + "epoch": 1.87, + "learning_rate": 1.5570878826245773e-05, + "loss": 0.3756, + "step": 2790 + }, + { + "epoch": 1.88, + "learning_rate": 1.5408602698314777e-05, + "loss": 0.3791, + "step": 2800 + }, + { + "epoch": 1.88, + "learning_rate": 1.5246799068071818e-05, + "loss": 0.3765, + "step": 2810 + }, + { + "epoch": 1.89, + "learning_rate": 1.5085475906393153e-05, + "loss": 0.3834, + "step": 2820 + }, + { + "epoch": 1.9, + "learning_rate": 
1.4924641160485923e-05, + "loss": 0.381, + "step": 2830 + }, + { + "epoch": 1.9, + "learning_rate": 1.4764302753496584e-05, + "loss": 0.3766, + "step": 2840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4604468584120607e-05, + "loss": 0.3815, + "step": 2850 + }, + { + "epoch": 1.92, + "learning_rate": 1.4445146526213415e-05, + "loss": 0.3774, + "step": 2860 + }, + { + "epoch": 1.92, + "learning_rate": 1.4286344428402454e-05, + "loss": 0.3879, + "step": 2870 + }, + { + "epoch": 1.93, + "learning_rate": 1.412807011370052e-05, + "loss": 0.3777, + "step": 2880 + }, + { + "epoch": 1.94, + "learning_rate": 1.3970331379120455e-05, + "loss": 0.3806, + "step": 2890 + }, + { + "epoch": 1.94, + "learning_rate": 1.3813135995290988e-05, + "loss": 0.3848, + "step": 2900 + }, + { + "epoch": 1.95, + "learning_rate": 1.3656491706073935e-05, + "loss": 0.3745, + "step": 2910 + }, + { + "epoch": 1.96, + "learning_rate": 1.350040622818275e-05, + "loss": 0.377, + "step": 2920 + }, + { + "epoch": 1.96, + "learning_rate": 1.3344887250802345e-05, + "loss": 0.3783, + "step": 2930 + }, + { + "epoch": 1.97, + "learning_rate": 1.3189942435210301e-05, + "loss": 0.3768, + "step": 2940 + }, + { + "epoch": 1.98, + "learning_rate": 1.303557941439949e-05, + "loss": 0.3744, + "step": 2950 + }, + { + "epoch": 1.98, + "learning_rate": 1.2881805792702031e-05, + "loss": 0.3788, + "step": 2960 + }, + { + "epoch": 1.99, + "learning_rate": 1.2728629145414645e-05, + "loss": 0.3735, + "step": 2970 + }, + { + "epoch": 2.0, + "learning_rate": 1.257605701842554e-05, + "loss": 0.3819, + "step": 2980 + }, + { + "epoch": 2.0, + "learning_rate": 1.242409692784265e-05, + "loss": 0.3812, + "step": 2990 + }, + { + "epoch": 2.01, + "learning_rate": 1.2272756359623342e-05, + "loss": 0.3769, + "step": 3000 + }, + { + "epoch": 2.02, + "learning_rate": 1.2122042769205702e-05, + "loss": 0.3779, + "step": 3010 + }, + { + "epoch": 2.02, + "learning_rate": 1.1971963581141196e-05, + "loss": 0.3817, + "step": 3020 + }, + { + "epoch": 2.03, + "learning_rate": 1.1822526188728966e-05, + "loss": 0.3788, + "step": 3030 + }, + { + "epoch": 2.04, + "learning_rate": 1.1673737953651601e-05, + "loss": 0.3776, + "step": 3040 + }, + { + "epoch": 2.04, + "learning_rate": 1.1525606205612447e-05, + "loss": 0.3785, + "step": 3050 + }, + { + "epoch": 2.05, + "learning_rate": 1.1378138241974595e-05, + "loss": 0.3858, + "step": 3060 + }, + { + "epoch": 2.06, + "learning_rate": 1.1231341327401323e-05, + "loss": 0.3766, + "step": 3070 + }, + { + "epoch": 2.06, + "learning_rate": 1.1085222693498256e-05, + "loss": 0.3766, + "step": 3080 + }, + { + "epoch": 2.07, + "learning_rate": 1.093978953845713e-05, + "loss": 0.3795, + "step": 3090 + }, + { + "epoch": 2.08, + "learning_rate": 1.079504902670117e-05, + "loss": 0.3837, + "step": 3100 + }, + { + "epoch": 2.08, + "learning_rate": 1.065100828853213e-05, + "loss": 0.377, + "step": 3110 + }, + { + "epoch": 2.09, + "learning_rate": 1.0507674419779085e-05, + "loss": 0.3759, + "step": 3120 + }, + { + "epoch": 2.1, + "learning_rate": 1.0365054481448849e-05, + "loss": 0.3704, + "step": 3130 + }, + { + "epoch": 2.1, + "learning_rate": 1.02231554993781e-05, + "loss": 0.3751, + "step": 3140 + }, + { + "epoch": 2.11, + "learning_rate": 1.0081984463887325e-05, + "loss": 0.396, + "step": 3150 + }, + { + "epoch": 2.12, + "learning_rate": 9.941548329436425e-06, + "loss": 0.3788, + "step": 3160 + }, + { + "epoch": 2.12, + "learning_rate": 9.801854014282108e-06, + "loss": 0.3767, + "step": 3170 + }, + { + "epoch": 2.13, + "learning_rate": 
9.662908400137125e-06, + "loss": 0.3783, + "step": 3180 + }, + { + "epoch": 2.14, + "learning_rate": 9.524718331831186e-06, + "loss": 0.3775, + "step": 3190 + }, + { + "epoch": 2.14, + "learning_rate": 9.387290616973859e-06, + "loss": 0.3789, + "step": 3200 + }, + { + "epoch": 2.15, + "learning_rate": 9.250632025619104e-06, + "loss": 0.3776, + "step": 3210 + }, + { + "epoch": 2.16, + "learning_rate": 9.11474928993187e-06, + "loss": 0.368, + "step": 3220 + }, + { + "epoch": 2.16, + "learning_rate": 8.979649103856345e-06, + "loss": 0.378, + "step": 3230 + }, + { + "epoch": 2.17, + "learning_rate": 8.84533812278629e-06, + "loss": 0.3776, + "step": 3240 + }, + { + "epoch": 2.18, + "learning_rate": 8.711822963237093e-06, + "loss": 0.3731, + "step": 3250 + }, + { + "epoch": 2.18, + "learning_rate": 8.579110202519894e-06, + "loss": 0.3827, + "step": 3260 + }, + { + "epoch": 2.19, + "learning_rate": 8.447206378417533e-06, + "loss": 0.3725, + "step": 3270 + }, + { + "epoch": 2.2, + "learning_rate": 8.31611798886246e-06, + "loss": 0.372, + "step": 3280 + }, + { + "epoch": 2.2, + "learning_rate": 8.185851491616677e-06, + "loss": 0.3753, + "step": 3290 + }, + { + "epoch": 2.21, + "learning_rate": 8.0564133039536e-06, + "loss": 0.3732, + "step": 3300 + }, + { + "epoch": 2.22, + "learning_rate": 7.927809802341876e-06, + "loss": 0.37, + "step": 3310 + }, + { + "epoch": 2.22, + "learning_rate": 7.800047322131346e-06, + "loss": 0.372, + "step": 3320 + }, + { + "epoch": 2.23, + "learning_rate": 7.673132157240877e-06, + "loss": 0.3734, + "step": 3330 + }, + { + "epoch": 2.24, + "learning_rate": 7.5470705598483405e-06, + "loss": 0.3734, + "step": 3340 + }, + { + "epoch": 2.24, + "learning_rate": 7.4218687400826075e-06, + "loss": 0.3784, + "step": 3350 + }, + { + "epoch": 2.25, + "learning_rate": 7.297532865717638e-06, + "loss": 0.3715, + "step": 3360 + }, + { + "epoch": 2.26, + "learning_rate": 7.174069061868591e-06, + "loss": 0.3836, + "step": 3370 + }, + { + "epoch": 2.27, + "learning_rate": 7.05148341069014e-06, + "loss": 0.3784, + "step": 3380 + }, + { + "epoch": 2.27, + "learning_rate": 6.929781951076836e-06, + "loss": 0.3737, + "step": 3390 + }, + { + "epoch": 2.28, + "learning_rate": 6.80897067836557e-06, + "loss": 0.3827, + "step": 3400 + }, + { + "epoch": 2.29, + "learning_rate": 6.6890555440403015e-06, + "loss": 0.3808, + "step": 3410 + }, + { + "epoch": 2.29, + "learning_rate": 6.570042455438822e-06, + "loss": 0.3797, + "step": 3420 + }, + { + "epoch": 2.3, + "learning_rate": 6.451937275461736e-06, + "loss": 0.3739, + "step": 3430 + }, + { + "epoch": 2.31, + "learning_rate": 6.334745822283699e-06, + "loss": 0.3748, + "step": 3440 + }, + { + "epoch": 2.31, + "learning_rate": 6.2184738690667214e-06, + "loss": 0.3769, + "step": 3450 + }, + { + "epoch": 2.32, + "learning_rate": 6.103127143675832e-06, + "loss": 0.3756, + "step": 3460 + }, + { + "epoch": 2.33, + "learning_rate": 5.988711328396859e-06, + "loss": 0.3738, + "step": 3470 + }, + { + "epoch": 2.33, + "learning_rate": 5.875232059656552e-06, + "loss": 0.3676, + "step": 3480 + }, + { + "epoch": 2.34, + "learning_rate": 5.762694927744866e-06, + "loss": 0.3737, + "step": 3490 + }, + { + "epoch": 2.35, + "learning_rate": 5.651105476539623e-06, + "loss": 0.369, + "step": 3500 + }, + { + "epoch": 2.35, + "learning_rate": 5.540469203233347e-06, + "loss": 0.3723, + "step": 3510 + }, + { + "epoch": 2.36, + "learning_rate": 5.430791558062518e-06, + "loss": 0.3791, + "step": 3520 + }, + { + "epoch": 2.37, + "learning_rate": 5.322077944039039e-06, + "loss": 
0.3753, + "step": 3530 + }, + { + "epoch": 2.37, + "learning_rate": 5.21433371668407e-06, + "loss": 0.3703, + "step": 3540 + }, + { + "epoch": 2.38, + "learning_rate": 5.107564183764219e-06, + "loss": 0.3781, + "step": 3550 + }, + { + "epoch": 2.39, + "learning_rate": 5.001774605030074e-06, + "loss": 0.3766, + "step": 3560 + }, + { + "epoch": 2.39, + "learning_rate": 4.8969701919570454e-06, + "loss": 0.38, + "step": 3570 + }, + { + "epoch": 2.4, + "learning_rate": 4.7931561074887e-06, + "loss": 0.3681, + "step": 3580 + }, + { + "epoch": 2.41, + "learning_rate": 4.690337465782366e-06, + "loss": 0.3752, + "step": 3590 + }, + { + "epoch": 2.41, + "learning_rate": 4.588519331957241e-06, + "loss": 0.3775, + "step": 3600 + }, + { + "epoch": 2.42, + "learning_rate": 4.4877067218448285e-06, + "loss": 0.3677, + "step": 3610 + }, + { + "epoch": 2.43, + "learning_rate": 4.38790460174188e-06, + "loss": 0.3718, + "step": 3620 + }, + { + "epoch": 2.43, + "learning_rate": 4.289117888165708e-06, + "loss": 0.3671, + "step": 3630 + }, + { + "epoch": 2.44, + "learning_rate": 4.191351447612032e-06, + "loss": 0.3728, + "step": 3640 + }, + { + "epoch": 2.45, + "learning_rate": 4.094610096315199e-06, + "loss": 0.3769, + "step": 3650 + }, + { + "epoch": 2.45, + "learning_rate": 3.998898600010928e-06, + "loss": 0.3777, + "step": 3660 + }, + { + "epoch": 2.46, + "learning_rate": 3.904221673701566e-06, + "loss": 0.3817, + "step": 3670 + }, + { + "epoch": 2.47, + "learning_rate": 3.810583981423796e-06, + "loss": 0.383, + "step": 3680 + }, + { + "epoch": 2.47, + "learning_rate": 3.7179901360188533e-06, + "loss": 0.3719, + "step": 3690 + }, + { + "epoch": 2.48, + "learning_rate": 3.626444698905329e-06, + "loss": 0.3716, + "step": 3700 + }, + { + "epoch": 2.49, + "learning_rate": 3.5359521798544347e-06, + "loss": 0.3736, + "step": 3710 + }, + { + "epoch": 2.49, + "learning_rate": 3.4465170367678294e-06, + "loss": 0.3741, + "step": 3720 + }, + { + "epoch": 2.5, + "learning_rate": 3.3581436754580363e-06, + "loss": 0.3756, + "step": 3730 + }, + { + "epoch": 2.51, + "learning_rate": 3.270836449431397e-06, + "loss": 0.3777, + "step": 3740 + }, + { + "epoch": 2.51, + "learning_rate": 3.184599659673579e-06, + "loss": 0.3774, + "step": 3750 + }, + { + "epoch": 2.52, + "learning_rate": 3.0994375544377424e-06, + "loss": 0.3785, + "step": 3760 + }, + { + "epoch": 2.53, + "learning_rate": 3.0153543290352164e-06, + "loss": 0.3768, + "step": 3770 + }, + { + "epoch": 2.53, + "learning_rate": 2.932354125628853e-06, + "loss": 0.377, + "step": 3780 + }, + { + "epoch": 2.54, + "learning_rate": 2.8504410330289778e-06, + "loss": 0.3803, + "step": 3790 + }, + { + "epoch": 2.55, + "learning_rate": 2.769619086491923e-06, + "loss": 0.3706, + "step": 3800 + }, + { + "epoch": 2.55, + "learning_rate": 2.6898922675213016e-06, + "loss": 0.3712, + "step": 3810 + }, + { + "epoch": 2.56, + "learning_rate": 2.611264503671823e-06, + "loss": 0.369, + "step": 3820 + }, + { + "epoch": 2.57, + "learning_rate": 2.533739668355814e-06, + "loss": 0.3726, + "step": 3830 + }, + { + "epoch": 2.57, + "learning_rate": 2.45732158065243e-06, + "loss": 0.3746, + "step": 3840 + }, + { + "epoch": 2.58, + "learning_rate": 2.382014005119501e-06, + "loss": 0.3701, + "step": 3850 + }, + { + "epoch": 2.59, + "learning_rate": 2.3078206516080695e-06, + "loss": 0.38, + "step": 3860 + }, + { + "epoch": 2.59, + "learning_rate": 2.2347451750796474e-06, + "loss": 0.3725, + "step": 3870 + }, + { + "epoch": 2.6, + "learning_rate": 2.1627911754261653e-06, + "loss": 0.3825, + "step": 
3880 + }, + { + "epoch": 2.61, + "learning_rate": 2.0919621972926156e-06, + "loss": 0.3791, + "step": 3890 + }, + { + "epoch": 2.61, + "learning_rate": 2.022261729902458e-06, + "loss": 0.3778, + "step": 3900 + }, + { + "epoch": 2.62, + "learning_rate": 1.953693206885715e-06, + "loss": 0.3735, + "step": 3910 + }, + { + "epoch": 2.63, + "learning_rate": 1.8862600061098106e-06, + "loss": 0.3816, + "step": 3920 + }, + { + "epoch": 2.63, + "learning_rate": 1.8199654495131974e-06, + "loss": 0.3752, + "step": 3930 + }, + { + "epoch": 2.64, + "learning_rate": 1.754812802941691e-06, + "loss": 0.3739, + "step": 3940 + }, + { + "epoch": 2.65, + "learning_rate": 1.6908052759875836e-06, + "loss": 0.3745, + "step": 3950 + }, + { + "epoch": 2.65, + "learning_rate": 1.6279460218315361e-06, + "loss": 0.3753, + "step": 3960 + }, + { + "epoch": 2.66, + "learning_rate": 1.5662381370872532e-06, + "loss": 0.3736, + "step": 3970 + }, + { + "epoch": 2.67, + "learning_rate": 1.5056846616489124e-06, + "loss": 0.3755, + "step": 3980 + }, + { + "epoch": 2.67, + "learning_rate": 1.4462885785414327e-06, + "loss": 0.3741, + "step": 3990 + }, + { + "epoch": 2.68, + "learning_rate": 1.3880528137735132e-06, + "loss": 0.3708, + "step": 4000 + }, + { + "epoch": 2.69, + "learning_rate": 1.3309802361934936e-06, + "loss": 0.3703, + "step": 4010 + }, + { + "epoch": 2.69, + "learning_rate": 1.2750736573480248e-06, + "loss": 0.3784, + "step": 4020 + }, + { + "epoch": 2.7, + "learning_rate": 1.2203358313435609e-06, + "loss": 0.3785, + "step": 4030 + }, + { + "epoch": 2.71, + "learning_rate": 1.1667694547106978e-06, + "loss": 0.3832, + "step": 4040 + }, + { + "epoch": 2.71, + "learning_rate": 1.1143771662713214e-06, + "loss": 0.3708, + "step": 4050 + }, + { + "epoch": 2.72, + "learning_rate": 1.063161547008612e-06, + "loss": 0.3777, + "step": 4060 + }, + { + "epoch": 2.73, + "learning_rate": 1.0131251199399089e-06, + "loss": 0.375, + "step": 4070 + }, + { + "epoch": 2.73, + "learning_rate": 9.642703499924216e-07, + "loss": 0.3719, + "step": 4080 + }, + { + "epoch": 2.74, + "learning_rate": 9.16599643881777e-07, + "loss": 0.3776, + "step": 4090 + }, + { + "epoch": 2.75, + "learning_rate": 8.701153499934833e-07, + "loss": 0.377, + "step": 4100 + }, + { + "epoch": 2.75, + "learning_rate": 8.248197582672395e-07, + "loss": 0.3759, + "step": 4110 + }, + { + "epoch": 2.76, + "learning_rate": 7.807151000841118e-07, + "loss": 0.3727, + "step": 4120 + }, + { + "epoch": 2.77, + "learning_rate": 7.378035481566181e-07, + "loss": 0.374, + "step": 4130 + }, + { + "epoch": 2.77, + "learning_rate": 6.960872164217064e-07, + "loss": 0.3792, + "step": 4140 + }, + { + "epoch": 2.78, + "learning_rate": 6.555681599365926e-07, + "loss": 0.3692, + "step": 4150 + }, + { + "epoch": 2.79, + "learning_rate": 6.16248374777545e-07, + "loss": 0.3736, + "step": 4160 + }, + { + "epoch": 2.79, + "learning_rate": 5.781297979415456e-07, + "loss": 0.3695, + "step": 4170 + }, + { + "epoch": 2.8, + "learning_rate": 5.412143072508563e-07, + "loss": 0.3716, + "step": 4180 + }, + { + "epoch": 2.81, + "learning_rate": 5.055037212605279e-07, + "loss": 0.364, + "step": 4190 + }, + { + "epoch": 2.81, + "learning_rate": 4.709997991688114e-07, + "loss": 0.3707, + "step": 4200 + }, + { + "epoch": 2.82, + "learning_rate": 4.377042407304827e-07, + "loss": 0.3833, + "step": 4210 + }, + { + "epoch": 2.83, + "learning_rate": 4.0561868617312316e-07, + "loss": 0.3791, + "step": 4220 + }, + { + "epoch": 2.83, + "learning_rate": 3.747447161163126e-07, + "loss": 0.3743, + "step": 4230 + }, + 
{ + "epoch": 2.84, + "learning_rate": 3.4508385149375764e-07, + "loss": 0.3759, + "step": 4240 + }, + { + "epoch": 2.85, + "learning_rate": 3.166375534783717e-07, + "loss": 0.3667, + "step": 4250 + }, + { + "epoch": 2.85, + "learning_rate": 2.8940722341030126e-07, + "loss": 0.3725, + "step": 4260 + }, + { + "epoch": 2.86, + "learning_rate": 2.6339420272787074e-07, + "loss": 0.3733, + "step": 4270 + }, + { + "epoch": 2.87, + "learning_rate": 2.3859977290152935e-07, + "loss": 0.373, + "step": 4280 + }, + { + "epoch": 2.87, + "learning_rate": 2.1502515537069334e-07, + "loss": 0.3725, + "step": 4290 + }, + { + "epoch": 2.88, + "learning_rate": 1.926715114835914e-07, + "loss": 0.3729, + "step": 4300 + }, + { + "epoch": 2.89, + "learning_rate": 1.7153994244005766e-07, + "loss": 0.3742, + "step": 4310 + }, + { + "epoch": 2.89, + "learning_rate": 1.516314892372639e-07, + "loss": 0.3739, + "step": 4320 + }, + { + "epoch": 2.9, + "learning_rate": 1.3294713261845503e-07, + "loss": 0.3755, + "step": 4330 + }, + { + "epoch": 2.91, + "learning_rate": 1.1548779302463231e-07, + "loss": 0.3702, + "step": 4340 + }, + { + "epoch": 2.92, + "learning_rate": 9.92543305492033e-08, + "loss": 0.3761, + "step": 4350 + }, + { + "epoch": 2.92, + "learning_rate": 8.424754489561038e-08, + "loss": 0.38, + "step": 4360 + }, + { + "epoch": 2.93, + "learning_rate": 7.046817533795102e-08, + "loss": 0.3737, + "step": 4370 + }, + { + "epoch": 2.94, + "learning_rate": 5.7916900684540366e-08, + "loss": 0.3722, + "step": 4380 + }, + { + "epoch": 2.94, + "learning_rate": 4.6594339244479536e-08, + "loss": 0.3805, + "step": 4390 + }, + { + "epoch": 2.95, + "learning_rate": 3.650104879719951e-08, + "loss": 0.3736, + "step": 4400 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 4.0028042210177974e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4400/training_args.bin b/checkpoint-4400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-4400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-600/README.md b/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-600/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, 
biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-600/adapter_config.json b/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-600/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-600/adapter_model.bin b/checkpoint-600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b26acd02a8254311e5cec6896a8a318de0a7ec06 --- /dev/null +++ b/checkpoint-600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89347e64b8a06ed3a0163b9ee1f276383d66671e319205ee5b71642ca775eadd +size 7820185 diff --git a/checkpoint-600/added_tokens.json b/checkpoint-600/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-600/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff 
--git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..45f64c485807258fb02eb55c4779ff52bac54a6a --- /dev/null +++ b/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d98acbdc226e6b26fba6650be9176ff323c5c12473d95dd8908bb8c1d0ba7509 +size 15644485 diff --git a/checkpoint-600/rng_state_0.pth b/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..2bfbd65a5afac3cd2a144318237b10b825519bd0 --- /dev/null +++ b/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b52ea747e6e0a768cd232c5b55851953e9c85231518694ac14e243c3ca0803e9 +size 21687 diff --git a/checkpoint-600/rng_state_1.pth b/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..994331c518953acd5298d3336b6ad8a12fa8a170 --- /dev/null +++ b/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71852138c9ea0a503c87ed28c04130f02140e0a2eabe1412a43508eb93ce2745 +size 21687 diff --git a/checkpoint-600/rng_state_2.pth b/checkpoint-600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..346c8671649489a2d3e57e8e8d1ccf555b9f3cef --- /dev/null +++ b/checkpoint-600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e655e3989866d10a29cfbfa1b85bc854689545d8e49a9cd9119017849fc7f814 +size 21687 diff --git a/checkpoint-600/rng_state_3.pth b/checkpoint-600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..783b9c07b53c920768a0bda672398e57c40448fb --- /dev/null +++ b/checkpoint-600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ac1a5b0a0a759892bb47727f3f2c0c1e7f2b7107df2915d305309680332e489 +size 21687 diff --git a/checkpoint-600/rng_state_4.pth b/checkpoint-600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..5c802028e48d4b6b8b153e08087683b724847302 --- /dev/null +++ b/checkpoint-600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c12e161add8642c2a20bb36c3dfafb94acb4d9bf93c125b6437ade2de8b7031b +size 21687 diff --git a/checkpoint-600/rng_state_5.pth b/checkpoint-600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..e3a1fd16c5869530d9e2e9418511dd2228f0498b --- /dev/null +++ b/checkpoint-600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c364841c5eff3331aefc62a93936d6bc22fb72e3d0c157556ded270f007d2d4c +size 21687 diff --git a/checkpoint-600/rng_state_6.pth b/checkpoint-600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc75dd6e8c26ab13c172212c876188c317d87661 --- /dev/null +++ b/checkpoint-600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5cb3d4c1121a9488c4f1c1a678ec1d7d72afd3712b08efdef61d29613afd89c +size 21687 diff --git a/checkpoint-600/rng_state_7.pth b/checkpoint-600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..71498fc8a5e681c4283b19201d6bdceb43104596 --- /dev/null +++ b/checkpoint-600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b1192ca660bfc8489909c2e666531a078951136a250c3bcd552d23883bb806 +size 21687 diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..c97ff13be8d1ec4e7b95d6891622188de77560b4 --- /dev/null +++ b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:981cef73e04dd527ce8c44ddc0520292dd774d006596455296dd827a85ee04c7 +size 627 diff --git a/checkpoint-600/special_tokens_map.json b/checkpoint-600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-600/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-600/tokenization_chatglm.py b/checkpoint-600/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-600/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-600/tokenizer.model b/checkpoint-600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-600/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bbab254d17aaae0601ffece8e3b33cfd2315c45b --- /dev/null +++ b/checkpoint-600/trainer_state.json @@ -0,0 +1,379 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.40207739989948066, + "eval_steps": 500, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 10 + }, + 
{ + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 4.9161716511842614e-05, + 
"loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 5.45756653015584e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/checkpoint-800/README.md b/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a63cc8bce19d70528c7d99504f86aa1d13ac419d --- /dev/null +++ b/checkpoint-800/README.md @@ -0,0 +1,207 @@ +--- +library_name: peft +base_model: /home/hz/projects/chatglm3-6b-32k +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- 
**Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +## Training procedure + + +### Framework versions + + +- PEFT 0.6.1 diff --git a/checkpoint-800/adapter_config.json b/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..96856b610b9126f64e07d8175ad6e4bc470c766f --- /dev/null +++ b/checkpoint-800/adapter_config.json @@ -0,0 +1,22 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/hz/projects/chatglm3-6b-32k", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-800/adapter_model.bin b/checkpoint-800/adapter_model.bin new file mode 100644 index 
0000000000000000000000000000000000000000..469bbffe1c4523cebefa35cfa32566a8b84de3ac --- /dev/null +++ b/checkpoint-800/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12dae66c89ffeba65cf4d9ea24b443bdb55901ecb988886ff13669ea165521b6 +size 7820185 diff --git a/checkpoint-800/added_tokens.json b/checkpoint-800/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..6d64564b1bf34ffe57bc0a267405abd406b40ea5 --- /dev/null +++ b/checkpoint-800/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|observation|>": 64797, + "<|user|>": 64795 +} diff --git a/checkpoint-800/optimizer.pt b/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e2ff7f27842efb732252ab5f7864f2a9548e039 --- /dev/null +++ b/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce6ab766bf0353b636992d7df9baecece3b90a1b9636539cfe02f58b7576895 +size 15644485 diff --git a/checkpoint-800/rng_state_0.pth b/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a948f0ccc3f9dcd64a5d7899d494b736c3baaba --- /dev/null +++ b/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44b7a9dc0d96e3639a3f0936b2bed5a65710016315a764761f727b0c2633ccc8 +size 21687 diff --git a/checkpoint-800/rng_state_1.pth b/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7cdd5d147b3516bb001069cac1c18759412fcd1c --- /dev/null +++ b/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:324b8bd28bcaf1d1a0a930bc7fb6f1768a0d030b91cb68dbbd11684bb4f84682 +size 21687 diff --git a/checkpoint-800/rng_state_2.pth b/checkpoint-800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..42df7b20bb3448d9ded7812acfef702e32d5276a --- /dev/null +++ b/checkpoint-800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8124781bea2667e12f5f539cfe09539dd0da7b76926af242c582cba83aeb2ed7 +size 21687 diff --git a/checkpoint-800/rng_state_3.pth b/checkpoint-800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ee4c6533c1ccb987153012677922abd8f7a1788 --- /dev/null +++ b/checkpoint-800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23aced4a9d1c92c2049ece2342510996841785bfe840ebf701b6d4477665d4f3 +size 21687 diff --git a/checkpoint-800/rng_state_4.pth b/checkpoint-800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc9a176a48783ced27c703928bd272cfbe770f02 --- /dev/null +++ b/checkpoint-800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be8813b1dee4baa19de3faa960bc3d1ec8e3ffd6c644a3d3ddaebad8e83eea89 +size 21687 diff --git a/checkpoint-800/rng_state_5.pth b/checkpoint-800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f8cd321ff363e21887c91c496c448e28ff33ee8 --- /dev/null +++ b/checkpoint-800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:709dde790884014a60e6ab586544744ca00fdd8aada396df991249888ee99668 +size 21687 diff --git a/checkpoint-800/rng_state_6.pth b/checkpoint-800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fba0f665fc9c760de13c5cb330cd7e5f5848888 --- /dev/null +++ b/checkpoint-800/rng_state_6.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26f5a38a1962a3b0a398ab38c906c6088e3391e28741830bf991289ebccade14 +size 21687 diff --git a/checkpoint-800/rng_state_7.pth b/checkpoint-800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7393e5705f2473019bc0a27f06ad5331a5fc5943 --- /dev/null +++ b/checkpoint-800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82b83179568147e2432170443b93f3765658fac398425205013e3356e0be586 +size 21687 diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..31ea1b09820c0cc6b82f0f0706a71e1d5c30d2e7 --- /dev/null +++ b/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c1ff9d132bdcd5196d1889072fa78474ec791c826e6c417626b1cd11c7ba218 +size 627 diff --git a/checkpoint-800/special_tokens_map.json b/checkpoint-800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/checkpoint-800/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/checkpoint-800/tokenization_chatglm.py b/checkpoint-800/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/checkpoint-800/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
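+
+        A minimal sketch, assuming the id layout recorded in this checkpoint's
+        tokenizer_config.json (special ids are appended right after the
+        SentencePiece vocabulary):
+
+            sp.convert_token_to_id("[gMASK]")          # 64790
+            sp.convert_token_to_id("<|observation|>")  # 64797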
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-800/tokenizer.model b/checkpoint-800/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-800/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-800/tokenizer_config.json b/checkpoint-800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/checkpoint-800/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..534033de43adc64fae5716dac8dc192be795186e --- /dev/null +++ b/checkpoint-800/trainer_state.json @@ -0,0 +1,499 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5361031998659742, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 10 + }, + { 
+ "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 4.9161716511842614e-05, + 
"loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + "loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 720 + }, + { + "epoch": 
0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 7.276421116279652e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c56923db88e531b64f53d5cfab0304dc2cb1fa5 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ] +} diff --git a/tokenization_chatglm.py b/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..e50d3295de8175baba69f232d69993abb5f5138a --- /dev/null +++ b/tokenization_chatglm.py @@ -0,0 +1,283 @@ +import json +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>", + "<|observation|>"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + 
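+                # A special id has no SentencePiece piece of its own, so its text is
+                # emitted directly; buffered ordinary ids are decoded together below so
+                # that multi-piece spans (e.g. multi-byte characters) round-trip intact.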
+            else:
+                buffer.append(token)
+        if buffer:
+            text += self.sp_model.decode(buffer)
+        return text
+
+    def decode_tokens(self, tokens: List[str]) -> str:
+        text = self.sp_model.DecodePieces(tokens)
+        return text
+
+    def convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        return self.sp_model.PieceToId(token)
+
+    def convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if index in self.index_special_tokens:
+            return self.index_special_tokens[index]
+        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
+            return ""
+        return self.sp_model.IdToPiece(index)
+
+
+class ChatGLMTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {"vocab_file": "tokenizer.model"}
+
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
+        self.name = "GLMTokenizer"
+
+        self.vocab_file = vocab_file
+        self.tokenizer = SPTokenizer(vocab_file)
+        self.special_tokens = {
+            "<bos>": self.tokenizer.bos_id,
+            "<eos>": self.tokenizer.eos_id,
+            "<pad>": self.tokenizer.pad_id
+        }
+        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
+
+    def get_command(self, token):
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
+        return self.tokenizer.special_tokens[token]
+
+    @property
+    def unk_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token_id(self):
+        return self.get_command("<pad>")
+
+    @property
+    def eos_token(self) -> str:
+        return "</s>"
+
+    @property
+    def eos_token_id(self):
+        return self.get_command("<eos>")
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_words
+
+    def get_vocab(self):
+        """ Returns vocab as a dict """
+        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text, **kwargs):
+        return self.tokenizer.tokenize(text)
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        return self.tokenizer.convert_token_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.tokenizer.convert_id_to_token(index)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return self.tokenizer.decode_tokens(tokens)
+
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+            filename_prefix (`str`, *optional*):
+                An optional prefix to add to the names of the saved files.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83cafe4d8571bcaa70520db976a381f3cd742c51 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..908713cc1dec5552cc9950bf65adcd068336abbb --- /dev/null +++ b/train_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 3.0, + "train_loss": 0.40742741023141216, + "train_runtime": 104521.3326, + "train_samples_per_second": 10.964, + "train_steps_per_second": 0.043 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..872a844bc19efed1ad36ae4da2a123c14e895f7a --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,448 @@ +{"current_steps": 10, "total_steps": 4476, "loss": 1.2211, "eval_loss": null, 
"predict_loss": null, "reward": null, "learning_rate": 4.99993842168232e-05, "epoch": 0.01, "percentage": 0.22, "elapsed_time": "0:03:51", "remaining_time": "1 day, 4:45:08"} +{"current_steps": 20, "total_steps": 4476, "loss": 1.0276, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9997536897627915e-05, "epoch": 0.01, "percentage": 0.45, "elapsed_time": "0:07:45", "remaining_time": "1 day, 4:47:04"} +{"current_steps": 30, "total_steps": 4476, "loss": 0.8587, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9994458133418e-05, "epoch": 0.02, "percentage": 0.67, "elapsed_time": "0:11:37", "remaining_time": "1 day, 4:42:32"} +{"current_steps": 40, "total_steps": 4476, "loss": 0.7431, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.999014807586154e-05, "epoch": 0.03, "percentage": 0.89, "elapsed_time": "0:15:30", "remaining_time": "1 day, 4:40:32"} +{"current_steps": 50, "total_steps": 4476, "loss": 0.6841, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9984606937283405e-05, "epoch": 0.03, "percentage": 1.12, "elapsed_time": "0:19:23", "remaining_time": "1 day, 4:36:46"} +{"current_steps": 60, "total_steps": 4476, "loss": 0.6452, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9977834990654804e-05, "epoch": 0.04, "percentage": 1.34, "elapsed_time": "0:23:17", "remaining_time": "1 day, 4:34:16"} +{"current_steps": 70, "total_steps": 4476, "loss": 0.6347, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.99698325695798e-05, "epoch": 0.05, "percentage": 1.56, "elapsed_time": "0:27:12", "remaining_time": "1 day, 4:32:11"} +{"current_steps": 80, "total_steps": 4476, "loss": 0.6109, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9960600068278876e-05, "epoch": 0.05, "percentage": 1.79, "elapsed_time": "0:31:06", "remaining_time": "1 day, 4:29:22"} +{"current_steps": 90, "total_steps": 4476, "loss": 0.5911, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.995013794156957e-05, "epoch": 0.06, "percentage": 2.01, "elapsed_time": "0:34:59", "remaining_time": "1 day, 4:25:31"} +{"current_steps": 100, "total_steps": 4476, "loss": 0.5803, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.993844670484401e-05, "epoch": 0.07, "percentage": 2.23, "elapsed_time": "0:38:53", "remaining_time": "1 day, 4:22:01"} +{"current_steps": 110, "total_steps": 4476, "loss": 0.5902, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.992552693404354e-05, "epoch": 0.07, "percentage": 2.46, "elapsed_time": "0:42:46", "remaining_time": "1 day, 4:18:01"} +{"current_steps": 120, "total_steps": 4476, "loss": 0.5745, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.991137926563036e-05, "epoch": 0.08, "percentage": 2.68, "elapsed_time": "0:46:39", "remaining_time": "1 day, 4:13:43"} +{"current_steps": 130, "total_steps": 4476, "loss": 0.5538, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9896004396556176e-05, "epoch": 0.09, "percentage": 2.9, "elapsed_time": "0:50:33", "remaining_time": "1 day, 4:10:11"} +{"current_steps": 140, "total_steps": 4476, "loss": 0.5495, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.987940308422783e-05, "epoch": 0.09, "percentage": 3.13, "elapsed_time": "0:54:27", "remaining_time": "1 day, 4:06:25"} +{"current_steps": 150, "total_steps": 4476, "loss": 
0.5433, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.986157614647005e-05, "epoch": 0.1, "percentage": 3.35, "elapsed_time": "0:58:20", "remaining_time": "1 day, 4:02:33"} +{"current_steps": 160, "total_steps": 4476, "loss": 0.548, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.984252446148508e-05, "epoch": 0.11, "percentage": 3.57, "elapsed_time": "1:02:14", "remaining_time": "1 day, 3:59:09"} +{"current_steps": 170, "total_steps": 4476, "loss": 0.5361, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.98222489678095e-05, "epoch": 0.11, "percentage": 3.8, "elapsed_time": "1:06:08", "remaining_time": "1 day, 3:55:13"} +{"current_steps": 180, "total_steps": 4476, "loss": 0.5331, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.980075066426796e-05, "epoch": 0.12, "percentage": 4.02, "elapsed_time": "1:10:01", "remaining_time": "1 day, 3:51:08"} +{"current_steps": 190, "total_steps": 4476, "loss": 0.53, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.977803060992393e-05, "epoch": 0.13, "percentage": 4.24, "elapsed_time": "1:13:54", "remaining_time": "1 day, 3:47:10"} +{"current_steps": 200, "total_steps": 4476, "loss": 0.5135, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.97540899240276e-05, "epoch": 0.13, "percentage": 4.47, "elapsed_time": "1:17:47", "remaining_time": "1 day, 3:43:02"} +{"current_steps": 210, "total_steps": 4476, "loss": 0.5101, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.972892978596069e-05, "epoch": 0.14, "percentage": 4.69, "elapsed_time": "1:21:40", "remaining_time": "1 day, 3:39:04"} +{"current_steps": 220, "total_steps": 4476, "loss": 0.5125, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.970255143517838e-05, "epoch": 0.15, "percentage": 4.92, "elapsed_time": "1:25:33", "remaining_time": "1 day, 3:35:07"} +{"current_steps": 230, "total_steps": 4476, "loss": 0.4928, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.967495617114826e-05, "epoch": 0.15, "percentage": 5.14, "elapsed_time": "1:29:26", "remaining_time": "1 day, 3:31:06"} +{"current_steps": 240, "total_steps": 4476, "loss": 0.4878, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.964614535328626e-05, "epoch": 0.16, "percentage": 5.36, "elapsed_time": "1:33:19", "remaining_time": "1 day, 3:27:15"} +{"current_steps": 250, "total_steps": 4476, "loss": 0.5017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.961612040088973e-05, "epoch": 0.17, "percentage": 5.59, "elapsed_time": "1:37:13", "remaining_time": "1 day, 3:23:21"} +{"current_steps": 260, "total_steps": 4476, "loss": 0.4863, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9584882793067534e-05, "epoch": 0.17, "percentage": 5.81, "elapsed_time": "1:41:07", "remaining_time": "1 day, 3:19:42"} +{"current_steps": 270, "total_steps": 4476, "loss": 0.4847, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.955243406866713e-05, "epoch": 0.18, "percentage": 6.03, "elapsed_time": "1:45:00", "remaining_time": "1 day, 3:15:42"} +{"current_steps": 280, "total_steps": 4476, "loss": 0.4868, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.951877582619881e-05, "epoch": 0.19, "percentage": 6.26, "elapsed_time": "1:48:53", "remaining_time": "1 day, 3:11:49"} +{"current_steps": 290, 
"total_steps": 4476, "loss": 0.4748, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.948390972375694e-05, "epoch": 0.19, "percentage": 6.48, "elapsed_time": "1:52:46", "remaining_time": "1 day, 3:07:55"} +{"current_steps": 300, "total_steps": 4476, "loss": 0.4764, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.944783747893825e-05, "epoch": 0.2, "percentage": 6.7, "elapsed_time": "1:56:41", "remaining_time": "1 day, 3:04:15"} +{"current_steps": 310, "total_steps": 4476, "loss": 0.4712, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.941056086875727e-05, "epoch": 0.21, "percentage": 6.93, "elapsed_time": "2:00:35", "remaining_time": "1 day, 3:00:41"} +{"current_steps": 320, "total_steps": 4476, "loss": 0.4642, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.937208172955876e-05, "epoch": 0.21, "percentage": 7.15, "elapsed_time": "2:04:29", "remaining_time": "1 day, 2:56:50"} +{"current_steps": 330, "total_steps": 4476, "loss": 0.4642, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9332401956927224e-05, "epoch": 0.22, "percentage": 7.37, "elapsed_time": "2:08:23", "remaining_time": "1 day, 2:53:02"} +{"current_steps": 340, "total_steps": 4476, "loss": 0.4709, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9291523505593604e-05, "epoch": 0.23, "percentage": 7.6, "elapsed_time": "2:12:16", "remaining_time": "1 day, 2:49:01"} +{"current_steps": 350, "total_steps": 4476, "loss": 0.461, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9249448389338905e-05, "epoch": 0.23, "percentage": 7.82, "elapsed_time": "2:16:09", "remaining_time": "1 day, 2:45:10"} +{"current_steps": 360, "total_steps": 4476, "loss": 0.4677, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.920617868089501e-05, "epoch": 0.24, "percentage": 8.04, "elapsed_time": "2:20:04", "remaining_time": "1 day, 2:41:28"} +{"current_steps": 370, "total_steps": 4476, "loss": 0.4564, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9161716511842614e-05, "epoch": 0.25, "percentage": 8.27, "elapsed_time": "2:23:59", "remaining_time": "1 day, 2:37:52"} +{"current_steps": 380, "total_steps": 4476, "loss": 0.4663, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.911606407250617e-05, "epoch": 0.25, "percentage": 8.49, "elapsed_time": "2:27:52", "remaining_time": "1 day, 2:33:56"} +{"current_steps": 390, "total_steps": 4476, "loss": 0.4682, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9069223611846014e-05, "epoch": 0.26, "percentage": 8.71, "elapsed_time": "2:31:45", "remaining_time": "1 day, 2:30:00"} +{"current_steps": 400, "total_steps": 4476, "loss": 0.4636, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9021197437347555e-05, "epoch": 0.27, "percentage": 8.94, "elapsed_time": "2:35:38", "remaining_time": "1 day, 2:25:56"} +{"current_steps": 410, "total_steps": 4476, "loss": 0.4569, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.897198791490762e-05, "epoch": 0.27, "percentage": 9.16, "elapsed_time": "2:39:32", "remaining_time": "1 day, 2:22:10"} +{"current_steps": 420, "total_steps": 4476, "loss": 0.462, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8921597468717887e-05, "epoch": 0.28, "percentage": 9.38, "elapsed_time": "2:43:25", "remaining_time": "1 day, 
2:18:17"} +{"current_steps": 430, "total_steps": 4476, "loss": 0.4563, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.887002858114548e-05, "epoch": 0.29, "percentage": 9.61, "elapsed_time": "2:47:20", "remaining_time": "1 day, 2:14:37"} +{"current_steps": 440, "total_steps": 4476, "loss": 0.4563, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.881728379261068e-05, "epoch": 0.29, "percentage": 9.83, "elapsed_time": "2:51:14", "remaining_time": "1 day, 2:10:44"} +{"current_steps": 450, "total_steps": 4476, "loss": 0.4468, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.876336570146175e-05, "epoch": 0.3, "percentage": 10.05, "elapsed_time": "2:55:07", "remaining_time": "1 day, 2:06:48"} +{"current_steps": 460, "total_steps": 4476, "loss": 0.4508, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.870827696384698e-05, "epoch": 0.31, "percentage": 10.28, "elapsed_time": "2:59:01", "remaining_time": "1 day, 2:02:54"} +{"current_steps": 470, "total_steps": 4476, "loss": 0.4507, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.865202029358379e-05, "epoch": 0.31, "percentage": 10.5, "elapsed_time": "3:02:54", "remaining_time": "1 day, 1:59:01"} +{"current_steps": 480, "total_steps": 4476, "loss": 0.4486, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.859459846202507e-05, "epoch": 0.32, "percentage": 10.72, "elapsed_time": "3:06:49", "remaining_time": "1 day, 1:55:16"} +{"current_steps": 490, "total_steps": 4476, "loss": 0.4423, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.853601429792265e-05, "epoch": 0.33, "percentage": 10.95, "elapsed_time": "3:10:41", "remaining_time": "1 day, 1:51:11"} +{"current_steps": 500, "total_steps": 4476, "loss": 0.4369, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.847627068728795e-05, "epoch": 0.34, "percentage": 11.17, "elapsed_time": "3:14:34", "remaining_time": "1 day, 1:47:12"} +{"current_steps": 510, "total_steps": 4476, "loss": 0.4429, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.841537057324979e-05, "epoch": 0.34, "percentage": 11.39, "elapsed_time": "3:18:28", "remaining_time": "1 day, 1:43:22"} +{"current_steps": 520, "total_steps": 4476, "loss": 0.4389, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.835331695590943e-05, "epoch": 0.35, "percentage": 11.62, "elapsed_time": "3:22:21", "remaining_time": "1 day, 1:39:30"} +{"current_steps": 530, "total_steps": 4476, "loss": 0.44, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.829011289219276e-05, "epoch": 0.36, "percentage": 11.84, "elapsed_time": "3:26:15", "remaining_time": "1 day, 1:35:39"} +{"current_steps": 540, "total_steps": 4476, "loss": 0.4476, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.82257614956997e-05, "epoch": 0.36, "percentage": 12.06, "elapsed_time": "3:30:09", "remaining_time": "1 day, 1:31:50"} +{"current_steps": 550, "total_steps": 4476, "loss": 0.4367, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.816026593655085e-05, "epoch": 0.37, "percentage": 12.29, "elapsed_time": "3:34:03", "remaining_time": "1 day, 1:27:59"} +{"current_steps": 560, "total_steps": 4476, "loss": 0.4357, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.809362944123129e-05, "epoch": 0.38, "percentage": 12.51, "elapsed_time": 
"3:37:56", "remaining_time": "1 day, 1:24:02"} +{"current_steps": 570, "total_steps": 4476, "loss": 0.4492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.802585529243164e-05, "epoch": 0.38, "percentage": 12.73, "elapsed_time": "3:41:50", "remaining_time": "1 day, 1:20:13"} +{"current_steps": 580, "total_steps": 4476, "loss": 0.4403, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.795694682888635e-05, "epoch": 0.39, "percentage": 12.96, "elapsed_time": "3:45:44", "remaining_time": "1 day, 1:16:22"} +{"current_steps": 590, "total_steps": 4476, "loss": 0.4406, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7886907445209234e-05, "epoch": 0.4, "percentage": 13.18, "elapsed_time": "3:49:37", "remaining_time": "1 day, 1:12:23"} +{"current_steps": 600, "total_steps": 4476, "loss": 0.4317, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.781574059172621e-05, "epoch": 0.4, "percentage": 13.4, "elapsed_time": "3:53:31", "remaining_time": "1 day, 1:08:31"} +{"current_steps": 610, "total_steps": 4476, "loss": 0.4379, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7743449774305386e-05, "epoch": 0.41, "percentage": 13.63, "elapsed_time": "3:57:24", "remaining_time": "1 day, 1:04:37"} +{"current_steps": 620, "total_steps": 4476, "loss": 0.4324, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7670038554184296e-05, "epoch": 0.42, "percentage": 13.85, "elapsed_time": "4:01:19", "remaining_time": "1 day, 1:00:51"} +{"current_steps": 630, "total_steps": 4476, "loss": 0.4329, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7595510547794465e-05, "epoch": 0.42, "percentage": 14.08, "elapsed_time": "4:05:12", "remaining_time": "1 day, 0:56:57"} +{"current_steps": 640, "total_steps": 4476, "loss": 0.4259, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.751986942658332e-05, "epoch": 0.43, "percentage": 14.3, "elapsed_time": "4:09:07", "remaining_time": "1 day, 0:53:09"} +{"current_steps": 650, "total_steps": 4476, "loss": 0.4256, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.744311891683325e-05, "epoch": 0.44, "percentage": 14.52, "elapsed_time": "4:13:01", "remaining_time": "1 day, 0:49:19"} +{"current_steps": 660, "total_steps": 4476, "loss": 0.4289, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.736526279947807e-05, "epoch": 0.44, "percentage": 14.75, "elapsed_time": "4:16:55", "remaining_time": "1 day, 0:45:28"} +{"current_steps": 670, "total_steps": 4476, "loss": 0.4353, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.728630490991676e-05, "epoch": 0.45, "percentage": 14.97, "elapsed_time": "4:20:48", "remaining_time": "1 day, 0:41:32"} +{"current_steps": 680, "total_steps": 4476, "loss": 0.4413, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7206249137824535e-05, "epoch": 0.46, "percentage": 15.19, "elapsed_time": "4:24:41", "remaining_time": "1 day, 0:37:37"} +{"current_steps": 690, "total_steps": 4476, "loss": 0.4302, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7125099426961185e-05, "epoch": 0.46, "percentage": 15.42, "elapsed_time": "4:28:35", "remaining_time": "1 day, 0:33:45"} +{"current_steps": 700, "total_steps": 4476, "loss": 0.4365, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.704285977497687e-05, 
"epoch": 0.47, "percentage": 15.64, "elapsed_time": "4:32:28", "remaining_time": "1 day, 0:29:49"} +{"current_steps": 710, "total_steps": 4476, "loss": 0.4238, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6959534233215116e-05, "epoch": 0.48, "percentage": 15.86, "elapsed_time": "4:36:22", "remaining_time": "1 day, 0:25:56"} +{"current_steps": 720, "total_steps": 4476, "loss": 0.4284, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.687512690651328e-05, "epoch": 0.48, "percentage": 16.09, "elapsed_time": "4:40:16", "remaining_time": "1 day, 0:22:04"} +{"current_steps": 730, "total_steps": 4476, "loss": 0.4193, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.678964195300028e-05, "epoch": 0.49, "percentage": 16.31, "elapsed_time": "4:44:09", "remaining_time": "1 day, 0:18:07"} +{"current_steps": 740, "total_steps": 4476, "loss": 0.4256, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.670308358389184e-05, "epoch": 0.5, "percentage": 16.53, "elapsed_time": "4:48:01", "remaining_time": "1 day, 0:14:10"} +{"current_steps": 750, "total_steps": 4476, "loss": 0.4288, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6615456063282944e-05, "epoch": 0.5, "percentage": 16.76, "elapsed_time": "4:51:53", "remaining_time": "1 day, 0:10:08"} +{"current_steps": 760, "total_steps": 4476, "loss": 0.4335, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.652676370793784e-05, "epoch": 0.51, "percentage": 16.98, "elapsed_time": "4:55:47", "remaining_time": "1 day, 0:06:16"} +{"current_steps": 770, "total_steps": 4476, "loss": 0.4271, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.643701088707736e-05, "epoch": 0.52, "percentage": 17.2, "elapsed_time": "4:59:42", "remaining_time": "1 day, 0:02:31"} +{"current_steps": 780, "total_steps": 4476, "loss": 0.4304, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.634620202216366e-05, "epoch": 0.52, "percentage": 17.43, "elapsed_time": "5:03:37", "remaining_time": "23:58:41"} +{"current_steps": 790, "total_steps": 4476, "loss": 0.4249, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.625434158668246e-05, "epoch": 0.53, "percentage": 17.65, "elapsed_time": "5:07:30", "remaining_time": "23:54:48"} +{"current_steps": 800, "total_steps": 4476, "loss": 0.4322, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6161434105922616e-05, "epoch": 0.54, "percentage": 17.87, "elapsed_time": "5:11:24", "remaining_time": "23:50:53"} +{"current_steps": 810, "total_steps": 4476, "loss": 0.4229, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6067484156753234e-05, "epoch": 0.54, "percentage": 18.1, "elapsed_time": "5:15:17", "remaining_time": "23:46:59"} +{"current_steps": 820, "total_steps": 4476, "loss": 0.4252, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.597249636739815e-05, "epoch": 0.55, "percentage": 18.32, "elapsed_time": "5:19:10", "remaining_time": "23:43:01"} +{"current_steps": 830, "total_steps": 4476, "loss": 0.413, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5876475417207974e-05, "epoch": 0.56, "percentage": 18.54, "elapsed_time": "5:23:03", "remaining_time": "23:39:08"} +{"current_steps": 840, "total_steps": 4476, "loss": 0.4186, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 
4.577942603642959e-05, "epoch": 0.56, "percentage": 18.77, "elapsed_time": "5:26:58", "remaining_time": "23:35:19"} +{"current_steps": 850, "total_steps": 4476, "loss": 0.4233, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.568135300597306e-05, "epoch": 0.57, "percentage": 18.99, "elapsed_time": "5:30:51", "remaining_time": "23:31:25"} +{"current_steps": 860, "total_steps": 4476, "loss": 0.4177, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5582261157176164e-05, "epoch": 0.58, "percentage": 19.21, "elapsed_time": "5:34:45", "remaining_time": "23:27:31"} +{"current_steps": 870, "total_steps": 4476, "loss": 0.4236, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5482155371566384e-05, "epoch": 0.58, "percentage": 19.44, "elapsed_time": "5:38:39", "remaining_time": "23:23:38"} +{"current_steps": 880, "total_steps": 4476, "loss": 0.4228, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.538104058062042e-05, "epoch": 0.59, "percentage": 19.66, "elapsed_time": "5:42:32", "remaining_time": "23:19:46"} +{"current_steps": 890, "total_steps": 4476, "loss": 0.4181, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5278921765521234e-05, "epoch": 0.6, "percentage": 19.88, "elapsed_time": "5:46:26", "remaining_time": "23:15:52"} +{"current_steps": 900, "total_steps": 4476, "loss": 0.4261, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.51758039569127e-05, "epoch": 0.6, "percentage": 20.11, "elapsed_time": "5:50:18", "remaining_time": "23:11:54"} +{"current_steps": 910, "total_steps": 4476, "loss": 0.4217, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5071692234651764e-05, "epoch": 0.61, "percentage": 20.33, "elapsed_time": "5:54:12", "remaining_time": "23:08:00"} +{"current_steps": 920, "total_steps": 4476, "loss": 0.4191, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4966591727558184e-05, "epoch": 0.62, "percentage": 20.55, "elapsed_time": "5:58:06", "remaining_time": "23:04:09"} +{"current_steps": 930, "total_steps": 4476, "loss": 0.4247, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.48605076131619e-05, "epoch": 0.62, "percentage": 20.78, "elapsed_time": "6:02:00", "remaining_time": "23:00:18"} +{"current_steps": 940, "total_steps": 4476, "loss": 0.4236, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.475344511744794e-05, "epoch": 0.63, "percentage": 21.0, "elapsed_time": "6:05:53", "remaining_time": "22:56:24"} +{"current_steps": 950, "total_steps": 4476, "loss": 0.4172, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.464540951459902e-05, "epoch": 0.64, "percentage": 21.22, "elapsed_time": "6:09:47", "remaining_time": "22:52:31"} +{"current_steps": 960, "total_steps": 4476, "loss": 0.4209, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4536406126735664e-05, "epoch": 0.64, "percentage": 21.45, "elapsed_time": "6:13:42", "remaining_time": "22:48:41"} +{"current_steps": 970, "total_steps": 4476, "loss": 0.4179, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.442644032365407e-05, "epoch": 0.65, "percentage": 21.67, "elapsed_time": "6:17:36", "remaining_time": "22:44:48"} +{"current_steps": 980, "total_steps": 4476, "loss": 0.4166, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.431551752256155e-05, "epoch": 0.66, 
"percentage": 21.89, "elapsed_time": "6:21:30", "remaining_time": "22:40:59"} +{"current_steps": 990, "total_steps": 4476, "loss": 0.4173, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.420364318780973e-05, "epoch": 0.66, "percentage": 22.12, "elapsed_time": "6:25:25", "remaining_time": "22:37:10"} +{"current_steps": 1000, "total_steps": 4476, "loss": 0.4166, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4090822830625236e-05, "epoch": 0.67, "percentage": 22.34, "elapsed_time": "6:29:20", "remaining_time": "22:33:19"} +{"current_steps": 1010, "total_steps": 4476, "loss": 0.4173, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3977062008838307e-05, "epoch": 0.68, "percentage": 22.56, "elapsed_time": "6:33:14", "remaining_time": "22:29:27"} +{"current_steps": 1020, "total_steps": 4476, "loss": 0.4049, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3862366326608975e-05, "epoch": 0.68, "percentage": 22.79, "elapsed_time": "6:37:06", "remaining_time": "22:25:30"} +{"current_steps": 1030, "total_steps": 4476, "loss": 0.4143, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.374674143415096e-05, "epoch": 0.69, "percentage": 23.01, "elapsed_time": "6:41:00", "remaining_time": "22:21:36"} +{"current_steps": 1040, "total_steps": 4476, "loss": 0.4219, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.363019302745334e-05, "epoch": 0.7, "percentage": 23.24, "elapsed_time": "6:44:52", "remaining_time": "22:17:39"} +{"current_steps": 1050, "total_steps": 4476, "loss": 0.4152, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3512726847999987e-05, "epoch": 0.7, "percentage": 23.46, "elapsed_time": "6:48:47", "remaining_time": "22:13:48"} +{"current_steps": 1060, "total_steps": 4476, "loss": 0.4153, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.339434868248665e-05, "epoch": 0.71, "percentage": 23.68, "elapsed_time": "6:52:40", "remaining_time": "22:09:53"} +{"current_steps": 1070, "total_steps": 4476, "loss": 0.4148, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3275064362535966e-05, "epoch": 0.72, "percentage": 23.91, "elapsed_time": "6:56:33", "remaining_time": "22:05:58"} +{"current_steps": 1080, "total_steps": 4476, "loss": 0.4147, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.315487976441014e-05, "epoch": 0.72, "percentage": 24.13, "elapsed_time": "7:00:25", "remaining_time": "22:01:59"} +{"current_steps": 1090, "total_steps": 4476, "loss": 0.41, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.303380080872145e-05, "epoch": 0.73, "percentage": 24.35, "elapsed_time": "7:04:20", "remaining_time": "21:58:09"} +{"current_steps": 1100, "total_steps": 4476, "loss": 0.4119, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.291183346014063e-05, "epoch": 0.74, "percentage": 24.58, "elapsed_time": "7:08:13", "remaining_time": "21:54:15"} +{"current_steps": 1110, "total_steps": 4476, "loss": 0.4173, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.278898372710296e-05, "epoch": 0.74, "percentage": 24.8, "elapsed_time": "7:12:08", "remaining_time": "21:50:25"} +{"current_steps": 1120, "total_steps": 4476, "loss": 0.4119, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.266525766151238e-05, "epoch": 0.75, "percentage": 25.02, 
"elapsed_time": "7:16:01", "remaining_time": "21:46:32"} +{"current_steps": 1130, "total_steps": 4476, "loss": 0.4163, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.254066135844326e-05, "epoch": 0.76, "percentage": 25.25, "elapsed_time": "7:19:54", "remaining_time": "21:42:37"} +{"current_steps": 1140, "total_steps": 4476, "loss": 0.4104, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2415200955840184e-05, "epoch": 0.76, "percentage": 25.47, "elapsed_time": "7:23:47", "remaining_time": "21:38:41"} +{"current_steps": 1150, "total_steps": 4476, "loss": 0.4045, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.228888263421557e-05, "epoch": 0.77, "percentage": 25.69, "elapsed_time": "7:27:40", "remaining_time": "21:34:46"} +{"current_steps": 1160, "total_steps": 4476, "loss": 0.413, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.216171261634521e-05, "epoch": 0.78, "percentage": 25.92, "elapsed_time": "7:31:33", "remaining_time": "21:30:51"} +{"current_steps": 1170, "total_steps": 4476, "loss": 0.4112, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2033697166961716e-05, "epoch": 0.78, "percentage": 26.14, "elapsed_time": "7:35:27", "remaining_time": "21:26:57"} +{"current_steps": 1180, "total_steps": 4476, "loss": 0.4018, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1904842592445906e-05, "epoch": 0.79, "percentage": 26.36, "elapsed_time": "7:39:21", "remaining_time": "21:23:05"} +{"current_steps": 1190, "total_steps": 4476, "loss": 0.4068, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.177515524051609e-05, "epoch": 0.8, "percentage": 26.59, "elapsed_time": "7:43:15", "remaining_time": "21:19:11"} +{"current_steps": 1200, "total_steps": 4476, "loss": 0.4029, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1644641499915454e-05, "epoch": 0.8, "percentage": 26.81, "elapsed_time": "7:47:07", "remaining_time": "21:15:14"} +{"current_steps": 1210, "total_steps": 4476, "loss": 0.4009, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.151330780009726e-05, "epoch": 0.81, "percentage": 27.03, "elapsed_time": "7:51:02", "remaining_time": "21:11:26"} +{"current_steps": 1220, "total_steps": 4476, "loss": 0.4073, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1381160610908134e-05, "epoch": 0.82, "percentage": 27.26, "elapsed_time": "7:54:56", "remaining_time": "21:07:33"} +{"current_steps": 1230, "total_steps": 4476, "loss": 0.4138, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.124820644226936e-05, "epoch": 0.82, "percentage": 27.48, "elapsed_time": "7:58:50", "remaining_time": "21:03:40"} +{"current_steps": 1240, "total_steps": 4476, "loss": 0.4139, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.111445184385616e-05, "epoch": 0.83, "percentage": 27.7, "elapsed_time": "8:02:44", "remaining_time": "20:59:47"} +{"current_steps": 1250, "total_steps": 4476, "loss": 0.4062, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.097990340477507e-05, "epoch": 0.84, "percentage": 27.93, "elapsed_time": "8:06:37", "remaining_time": "20:55:53"} +{"current_steps": 1260, "total_steps": 4476, "loss": 0.4044, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.0844567753239276e-05, "epoch": 0.84, "percentage": 28.15, "elapsed_time": 
"8:10:31", "remaining_time": "20:52:00"} +{"current_steps": 1270, "total_steps": 4476, "loss": 0.3978, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.070845155624221e-05, "epoch": 0.85, "percentage": 28.37, "elapsed_time": "8:14:23", "remaining_time": "20:48:03"} +{"current_steps": 1280, "total_steps": 4476, "loss": 0.4102, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.0571561519228984e-05, "epoch": 0.86, "percentage": 28.6, "elapsed_time": "8:18:17", "remaining_time": "20:44:10"} +{"current_steps": 1290, "total_steps": 4476, "loss": 0.4052, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.043390438576616e-05, "epoch": 0.86, "percentage": 28.82, "elapsed_time": "8:22:10", "remaining_time": "20:40:15"} +{"current_steps": 1300, "total_steps": 4476, "loss": 0.4048, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.029548693720949e-05, "epoch": 0.87, "percentage": 29.04, "elapsed_time": "8:26:04", "remaining_time": "20:36:22"} +{"current_steps": 1310, "total_steps": 4476, "loss": 0.4008, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.0156315992369864e-05, "epoch": 0.88, "percentage": 29.27, "elapsed_time": "8:29:57", "remaining_time": "20:32:28"} +{"current_steps": 1320, "total_steps": 4476, "loss": 0.4038, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.001639840717741e-05, "epoch": 0.88, "percentage": 29.49, "elapsed_time": "8:33:50", "remaining_time": "20:28:33"} +{"current_steps": 1330, "total_steps": 4476, "loss": 0.408, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.9875741074343744e-05, "epoch": 0.89, "percentage": 29.71, "elapsed_time": "8:37:45", "remaining_time": "20:24:43"} +{"current_steps": 1340, "total_steps": 4476, "loss": 0.406, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.973435092302239e-05, "epoch": 0.9, "percentage": 29.94, "elapsed_time": "8:41:39", "remaining_time": "20:20:50"} +{"current_steps": 1350, "total_steps": 4476, "loss": 0.3991, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.959223491846749e-05, "epoch": 0.9, "percentage": 30.16, "elapsed_time": "8:45:31", "remaining_time": "20:16:53"} +{"current_steps": 1360, "total_steps": 4476, "loss": 0.4091, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.94494000616906e-05, "epoch": 0.91, "percentage": 30.38, "elapsed_time": "8:49:24", "remaining_time": "20:12:57"} +{"current_steps": 1370, "total_steps": 4476, "loss": 0.4, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.93058533891159e-05, "epoch": 0.92, "percentage": 30.61, "elapsed_time": "8:53:16", "remaining_time": "20:09:01"} +{"current_steps": 1380, "total_steps": 4476, "loss": 0.4112, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.916160197223344e-05, "epoch": 0.92, "percentage": 30.83, "elapsed_time": "8:57:09", "remaining_time": "20:05:06"} +{"current_steps": 1390, "total_steps": 4476, "loss": 0.4024, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.901665291725091e-05, "epoch": 0.93, "percentage": 31.05, "elapsed_time": "9:01:03", "remaining_time": "20:01:12"} +{"current_steps": 1400, "total_steps": 4476, "loss": 0.4048, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.887101336474346e-05, "epoch": 0.94, "percentage": 31.28, "elapsed_time": "9:04:56", "remaining_time": 
"19:57:18"} +{"current_steps": 1410, "total_steps": 4476, "loss": 0.4112, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8724690489302004e-05, "epoch": 0.94, "percentage": 31.5, "elapsed_time": "9:08:50", "remaining_time": "19:53:26"} +{"current_steps": 1420, "total_steps": 4476, "loss": 0.3947, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.857769149917973e-05, "epoch": 0.95, "percentage": 31.72, "elapsed_time": "9:12:43", "remaining_time": "19:49:31"} +{"current_steps": 1430, "total_steps": 4476, "loss": 0.4005, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.843002363593707e-05, "epoch": 0.96, "percentage": 31.95, "elapsed_time": "9:16:36", "remaining_time": "19:45:36"} +{"current_steps": 1440, "total_steps": 4476, "loss": 0.3976, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.828169417408488e-05, "epoch": 0.96, "percentage": 32.17, "elapsed_time": "9:20:29", "remaining_time": "19:41:42"} +{"current_steps": 1450, "total_steps": 4476, "loss": 0.4006, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8132710420726146e-05, "epoch": 0.97, "percentage": 32.39, "elapsed_time": "9:24:24", "remaining_time": "19:37:50"} +{"current_steps": 1460, "total_steps": 4476, "loss": 0.398, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7983079715195984e-05, "epoch": 0.98, "percentage": 32.62, "elapsed_time": "9:28:18", "remaining_time": "19:33:59"} +{"current_steps": 1470, "total_steps": 4476, "loss": 0.3987, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.78328094287001e-05, "epoch": 0.99, "percentage": 32.84, "elapsed_time": "9:32:12", "remaining_time": "19:30:06"} +{"current_steps": 1480, "total_steps": 4476, "loss": 0.4013, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.768190696395162e-05, "epoch": 0.99, "percentage": 33.07, "elapsed_time": "9:36:06", "remaining_time": "19:26:14"} +{"current_steps": 1490, "total_steps": 4476, "loss": 0.4028, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7530379754806494e-05, "epoch": 1.0, "percentage": 33.29, "elapsed_time": "9:39:59", "remaining_time": "19:22:18"} +{"current_steps": 1500, "total_steps": 4476, "loss": 0.4036, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.737823526589722e-05, "epoch": 1.01, "percentage": 33.51, "elapsed_time": "9:43:53", "remaining_time": "19:18:25"} +{"current_steps": 1510, "total_steps": 4476, "loss": 0.3937, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7225480992265125e-05, "epoch": 1.01, "percentage": 33.74, "elapsed_time": "9:47:47", "remaining_time": "19:14:33"} +{"current_steps": 1520, "total_steps": 4476, "loss": 0.4007, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.707212445899116e-05, "epoch": 1.02, "percentage": 33.96, "elapsed_time": "9:51:40", "remaining_time": "19:10:38"} +{"current_steps": 1530, "total_steps": 4476, "loss": 0.4004, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6918173220825204e-05, "epoch": 1.03, "percentage": 34.18, "elapsed_time": "9:55:33", "remaining_time": "19:06:45"} +{"current_steps": 1540, "total_steps": 4476, "loss": 0.4004, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6763634861813836e-05, "epoch": 1.03, "percentage": 34.41, "elapsed_time": "9:59:28", "remaining_time": "19:02:53"} 
+{"current_steps": 1550, "total_steps": 4476, "loss": 0.3991, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.660851699492679e-05, "epoch": 1.04, "percentage": 34.63, "elapsed_time": "10:03:22", "remaining_time": "18:59:00"} +{"current_steps": 1560, "total_steps": 4476, "loss": 0.4042, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.645282726168191e-05, "epoch": 1.05, "percentage": 34.85, "elapsed_time": "10:07:17", "remaining_time": "18:55:10"} +{"current_steps": 1570, "total_steps": 4476, "loss": 0.4043, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6296573331768664e-05, "epoch": 1.05, "percentage": 35.08, "elapsed_time": "10:11:12", "remaining_time": "18:51:18"} +{"current_steps": 1580, "total_steps": 4476, "loss": 0.3948, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.613976290267036e-05, "epoch": 1.06, "percentage": 35.3, "elapsed_time": "10:15:06", "remaining_time": "18:47:25"} +{"current_steps": 1590, "total_steps": 4476, "loss": 0.3952, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.598240369928494e-05, "epoch": 1.07, "percentage": 35.52, "elapsed_time": "10:19:00", "remaining_time": "18:43:33"} +{"current_steps": 1600, "total_steps": 4476, "loss": 0.4002, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5824503473544405e-05, "epoch": 1.07, "percentage": 35.75, "elapsed_time": "10:22:53", "remaining_time": "18:39:38"} +{"current_steps": 1610, "total_steps": 4476, "loss": 0.4079, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.566607000403298e-05, "epoch": 1.08, "percentage": 35.97, "elapsed_time": "10:26:46", "remaining_time": "18:35:44"} +{"current_steps": 1620, "total_steps": 4476, "loss": 0.3942, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5523030408223166e-05, "epoch": 1.09, "percentage": 36.19, "elapsed_time": "10:30:40", "remaining_time": "18:31:50"} +{"current_steps": 1630, "total_steps": 4476, "loss": 0.3935, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5363605299319165e-05, "epoch": 1.09, "percentage": 36.42, "elapsed_time": "10:34:33", "remaining_time": "18:27:56"} +{"current_steps": 1640, "total_steps": 4476, "loss": 0.3898, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.520366965171161e-05, "epoch": 1.1, "percentage": 36.64, "elapsed_time": "10:38:26", "remaining_time": "18:24:02"} +{"current_steps": 1650, "total_steps": 4476, "loss": 0.4006, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.504323134425501e-05, "epoch": 1.11, "percentage": 36.86, "elapsed_time": "10:42:21", "remaining_time": "18:20:10"} +{"current_steps": 1660, "total_steps": 4476, "loss": 0.4089, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.48822982805662e-05, "epoch": 1.11, "percentage": 37.09, "elapsed_time": "10:46:15", "remaining_time": "18:16:18"} +{"current_steps": 1670, "total_steps": 4476, "loss": 0.3982, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.472087838863505e-05, "epoch": 1.12, "percentage": 37.31, "elapsed_time": "10:50:08", "remaining_time": "18:12:23"} +{"current_steps": 1680, "total_steps": 4476, "loss": 0.399, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.455897962043387e-05, "epoch": 1.13, "percentage": 37.53, "elapsed_time": "10:54:01", "remaining_time": "18:08:29"} 
+{"current_steps": 1690, "total_steps": 4476, "loss": 0.3964, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.4396609951525676e-05, "epoch": 1.13, "percentage": 37.76, "elapsed_time": "10:57:54", "remaining_time": "18:04:34"} +{"current_steps": 1700, "total_steps": 4476, "loss": 0.3909, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.423377738067132e-05, "epoch": 1.14, "percentage": 37.98, "elapsed_time": "11:01:47", "remaining_time": "18:00:40"} +{"current_steps": 1710, "total_steps": 4476, "loss": 0.4015, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.407048992943541e-05, "epoch": 1.15, "percentage": 38.2, "elapsed_time": "11:05:41", "remaining_time": "17:56:47"} +{"current_steps": 1720, "total_steps": 4476, "loss": 0.3915, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.39067556417912e-05, "epoch": 1.15, "percentage": 38.43, "elapsed_time": "11:09:34", "remaining_time": "17:52:52"} +{"current_steps": 1730, "total_steps": 4476, "loss": 0.3845, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.374258258372426e-05, "epoch": 1.16, "percentage": 38.65, "elapsed_time": "11:13:27", "remaining_time": "17:48:58"} +{"current_steps": 1740, "total_steps": 4476, "loss": 0.4018, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.357797884283517e-05, "epoch": 1.17, "percentage": 38.87, "elapsed_time": "11:17:21", "remaining_time": "17:45:05"} +{"current_steps": 1750, "total_steps": 4476, "loss": 0.3914, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3412952527941096e-05, "epoch": 1.17, "percentage": 39.1, "elapsed_time": "11:21:14", "remaining_time": "17:41:11"} +{"current_steps": 1760, "total_steps": 4476, "loss": 0.3909, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.32475117686763e-05, "epoch": 1.18, "percentage": 39.32, "elapsed_time": "11:25:07", "remaining_time": "17:37:16"} +{"current_steps": 1770, "total_steps": 4476, "loss": 0.3993, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.308166471509171e-05, "epoch": 1.19, "percentage": 39.54, "elapsed_time": "11:29:00", "remaining_time": "17:33:21"} +{"current_steps": 1780, "total_steps": 4476, "loss": 0.3906, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2915419537253346e-05, "epoch": 1.19, "percentage": 39.77, "elapsed_time": "11:32:53", "remaining_time": "17:29:27"} +{"current_steps": 1790, "total_steps": 4476, "loss": 0.3897, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.274878442483991e-05, "epoch": 1.2, "percentage": 39.99, "elapsed_time": "11:36:47", "remaining_time": "17:25:34"} +{"current_steps": 1800, "total_steps": 4476, "loss": 0.3954, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.258176758673932e-05, "epoch": 1.21, "percentage": 40.21, "elapsed_time": "11:40:41", "remaining_time": "17:21:41"} +{"current_steps": 1810, "total_steps": 4476, "loss": 0.3835, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.241437725064431e-05, "epoch": 1.21, "percentage": 40.44, "elapsed_time": "11:44:35", "remaining_time": "17:17:48"} +{"current_steps": 1820, "total_steps": 4476, "loss": 0.3854, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.224662166264711e-05, "epoch": 1.22, "percentage": 40.66, "elapsed_time": "11:48:29", "remaining_time": "17:13:55"} 
+{"current_steps": 1830, "total_steps": 4476, "loss": 0.3924, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.207850908683322e-05, "epoch": 1.23, "percentage": 40.88, "elapsed_time": "11:52:22", "remaining_time": "17:10:01"} +{"current_steps": 1840, "total_steps": 4476, "loss": 0.3888, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.191004780487434e-05, "epoch": 1.23, "percentage": 41.11, "elapsed_time": "11:56:16", "remaining_time": "17:06:08"} +{"current_steps": 1850, "total_steps": 4476, "loss": 0.3914, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1741246115620336e-05, "epoch": 1.24, "percentage": 41.33, "elapsed_time": "12:00:10", "remaining_time": "17:02:15"} +{"current_steps": 1860, "total_steps": 4476, "loss": 0.391, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.157211233469042e-05, "epoch": 1.25, "percentage": 41.55, "elapsed_time": "12:04:05", "remaining_time": "16:58:23"} +{"current_steps": 1870, "total_steps": 4476, "loss": 0.3916, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.140265479406358e-05, "epoch": 1.25, "percentage": 41.78, "elapsed_time": "12:07:58", "remaining_time": "16:54:29"} +{"current_steps": 1880, "total_steps": 4476, "loss": 0.4012, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1232881841668015e-05, "epoch": 1.26, "percentage": 42.0, "elapsed_time": "12:11:51", "remaining_time": "16:50:34"} +{"current_steps": 1890, "total_steps": 4476, "loss": 0.3934, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.106280184096996e-05, "epoch": 1.27, "percentage": 42.23, "elapsed_time": "12:15:43", "remaining_time": "16:46:39"} +{"current_steps": 1900, "total_steps": 4476, "loss": 0.3908, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.089242317056168e-05, "epoch": 1.27, "percentage": 42.45, "elapsed_time": "12:19:37", "remaining_time": "16:42:46"} +{"current_steps": 1910, "total_steps": 4476, "loss": 0.3972, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.072175422374867e-05, "epoch": 1.28, "percentage": 42.67, "elapsed_time": "12:23:30", "remaining_time": "16:38:52"} +{"current_steps": 1920, "total_steps": 4476, "loss": 0.3963, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.055080340813623e-05, "epoch": 1.29, "percentage": 42.9, "elapsed_time": "12:27:25", "remaining_time": "16:35:00"} +{"current_steps": 1930, "total_steps": 4476, "loss": 0.3941, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0379579145215287e-05, "epoch": 1.29, "percentage": 43.12, "elapsed_time": "12:31:20", "remaining_time": "16:31:08"} +{"current_steps": 1940, "total_steps": 4476, "loss": 0.3887, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0208089869947475e-05, "epoch": 1.3, "percentage": 43.34, "elapsed_time": "12:35:13", "remaining_time": "16:27:14"} +{"current_steps": 1950, "total_steps": 4476, "loss": 0.3879, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0036344030349644e-05, "epoch": 1.31, "percentage": 43.57, "elapsed_time": "12:39:06", "remaining_time": "16:23:20"} +{"current_steps": 1960, "total_steps": 4476, "loss": 0.3945, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9864350087077702e-05, "epoch": 1.31, "percentage": 43.79, "elapsed_time": "12:42:59", "remaining_time": "16:19:26"} 
+{"current_steps": 1970, "total_steps": 4476, "loss": 0.3909, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.969211651300978e-05, "epoch": 1.32, "percentage": 44.01, "elapsed_time": "12:46:54", "remaining_time": "16:15:33"} +{"current_steps": 1980, "total_steps": 4476, "loss": 0.3871, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9519651792828877e-05, "epoch": 1.33, "percentage": 44.24, "elapsed_time": "12:50:47", "remaining_time": "16:11:39"} +{"current_steps": 1990, "total_steps": 4476, "loss": 0.3803, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9346964422604846e-05, "epoch": 1.33, "percentage": 44.46, "elapsed_time": "12:54:39", "remaining_time": "16:07:44"} +{"current_steps": 2000, "total_steps": 4476, "loss": 0.3868, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9174062909375892e-05, "epoch": 1.34, "percentage": 44.68, "elapsed_time": "12:58:33", "remaining_time": "16:03:51"} +{"current_steps": 2010, "total_steps": 4476, "loss": 0.385, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9000955770729464e-05, "epoch": 1.35, "percentage": 44.91, "elapsed_time": "13:02:26", "remaining_time": "15:59:56"} +{"current_steps": 2020, "total_steps": 4476, "loss": 0.3871, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8827651534382655e-05, "epoch": 1.35, "percentage": 45.13, "elapsed_time": "13:06:19", "remaining_time": "15:56:03"} +{"current_steps": 2030, "total_steps": 4476, "loss": 0.3956, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8654158737762122e-05, "epoch": 1.36, "percentage": 45.35, "elapsed_time": "13:10:13", "remaining_time": "15:52:09"} +{"current_steps": 2040, "total_steps": 4476, "loss": 0.3884, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8480485927583506e-05, "epoch": 1.37, "percentage": 45.58, "elapsed_time": "13:14:06", "remaining_time": "15:48:14"} +{"current_steps": 2050, "total_steps": 4476, "loss": 0.3829, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8306641659430382e-05, "epoch": 1.37, "percentage": 45.8, "elapsed_time": "13:17:59", "remaining_time": "15:44:21"} +{"current_steps": 2060, "total_steps": 4476, "loss": 0.3916, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8132634497332815e-05, "epoch": 1.38, "percentage": 46.02, "elapsed_time": "13:21:54", "remaining_time": "15:40:29"} +{"current_steps": 2070, "total_steps": 4476, "loss": 0.3924, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7958473013345447e-05, "epoch": 1.39, "percentage": 46.25, "elapsed_time": "13:25:47", "remaining_time": "15:36:35"} +{"current_steps": 2080, "total_steps": 4476, "loss": 0.3906, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7784165787125226e-05, "epoch": 1.39, "percentage": 46.47, "elapsed_time": "13:29:39", "remaining_time": "15:32:40"} +{"current_steps": 2090, "total_steps": 4476, "loss": 0.383, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7609721405508758e-05, "epoch": 1.4, "percentage": 46.69, "elapsed_time": "13:33:32", "remaining_time": "15:28:46"} +{"current_steps": 2100, "total_steps": 4476, "loss": 0.3892, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7435148462089282e-05, "epoch": 1.41, "percentage": 46.92, "elapsed_time": "13:37:25", "remaining_time": "15:24:51"} 
+{"current_steps": 2110, "total_steps": 4476, "loss": 0.3866, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7260455556793325e-05, "epoch": 1.41, "percentage": 47.14, "elapsed_time": "13:41:20", "remaining_time": "15:20:59"} +{"current_steps": 2120, "total_steps": 4476, "loss": 0.382, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.708565129545706e-05, "epoch": 1.42, "percentage": 47.36, "elapsed_time": "13:45:13", "remaining_time": "15:17:05"} +{"current_steps": 2130, "total_steps": 4476, "loss": 0.3825, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.691074428940237e-05, "epoch": 1.43, "percentage": 47.59, "elapsed_time": "13:49:06", "remaining_time": "15:13:11"} +{"current_steps": 2140, "total_steps": 4476, "loss": 0.3828, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.673574315501259e-05, "epoch": 1.43, "percentage": 47.81, "elapsed_time": "13:53:00", "remaining_time": "15:09:17"} +{"current_steps": 2150, "total_steps": 4476, "loss": 0.3845, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.656065651330808e-05, "epoch": 1.44, "percentage": 48.03, "elapsed_time": "13:56:53", "remaining_time": "15:05:23"} +{"current_steps": 2160, "total_steps": 4476, "loss": 0.3904, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6385492989521522e-05, "epoch": 1.45, "percentage": 48.26, "elapsed_time": "14:00:46", "remaining_time": "15:01:29"} +{"current_steps": 2170, "total_steps": 4476, "loss": 0.3934, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6210261212673004e-05, "epoch": 1.45, "percentage": 48.48, "elapsed_time": "14:04:39", "remaining_time": "14:57:35"} +{"current_steps": 2180, "total_steps": 4476, "loss": 0.3893, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6034969815144938e-05, "epoch": 1.46, "percentage": 48.7, "elapsed_time": "14:08:33", "remaining_time": "14:53:43"} +{"current_steps": 2190, "total_steps": 4476, "loss": 0.3965, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5859627432256816e-05, "epoch": 1.47, "percentage": 48.93, "elapsed_time": "14:12:26", "remaining_time": "14:49:48"} +{"current_steps": 2200, "total_steps": 4476, "loss": 0.3833, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.568424270183981e-05, "epoch": 1.47, "percentage": 49.15, "elapsed_time": "14:16:20", "remaining_time": "14:45:55"} +{"current_steps": 2210, "total_steps": 4476, "loss": 0.3822, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.550882426381123e-05, "epoch": 1.48, "percentage": 49.37, "elapsed_time": "14:20:13", "remaining_time": "14:42:01"} +{"current_steps": 2220, "total_steps": 4476, "loss": 0.3823, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5333380759748925e-05, "epoch": 1.49, "percentage": 49.6, "elapsed_time": "14:24:06", "remaining_time": "14:38:07"} +{"current_steps": 2230, "total_steps": 4476, "loss": 0.3861, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.515792083246556e-05, "epoch": 1.49, "percentage": 49.82, "elapsed_time": "14:27:59", "remaining_time": "14:34:13"} +{"current_steps": 2240, "total_steps": 4476, "loss": 0.3861, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4982453125582834e-05, "epoch": 1.5, "percentage": 50.04, "elapsed_time": "14:31:51", "remaining_time": "14:30:18"} 
+{"current_steps": 2250, "total_steps": 4476, "loss": 0.3909, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4806986283105712e-05, "epoch": 1.51, "percentage": 50.27, "elapsed_time": "14:35:45", "remaining_time": "14:26:25"} +{"current_steps": 2260, "total_steps": 4476, "loss": 0.3854, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.463152894899658e-05, "epoch": 1.51, "percentage": 50.49, "elapsed_time": "14:39:39", "remaining_time": "14:22:32"} +{"current_steps": 2270, "total_steps": 4476, "loss": 0.394, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.445608976674939e-05, "epoch": 1.52, "percentage": 50.71, "elapsed_time": "14:43:33", "remaining_time": "14:18:38"} +{"current_steps": 2280, "total_steps": 4476, "loss": 0.3866, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4280677378963906e-05, "epoch": 1.53, "percentage": 50.94, "elapsed_time": "14:47:26", "remaining_time": "14:14:45"} +{"current_steps": 2290, "total_steps": 4476, "loss": 0.3879, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.410530042691992e-05, "epoch": 1.53, "percentage": 51.16, "elapsed_time": "14:51:20", "remaining_time": "14:10:51"} +{"current_steps": 2300, "total_steps": 4476, "loss": 0.388, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3929967550151568e-05, "epoch": 1.54, "percentage": 51.39, "elapsed_time": "14:55:12", "remaining_time": "14:06:57"} +{"current_steps": 2310, "total_steps": 4476, "loss": 0.3868, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.375468738602171e-05, "epoch": 1.55, "percentage": 51.61, "elapsed_time": "14:59:05", "remaining_time": "14:03:02"} +{"current_steps": 2320, "total_steps": 4476, "loss": 0.3769, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3579468569296464e-05, "epoch": 1.55, "percentage": 51.83, "elapsed_time": "15:02:58", "remaining_time": "13:59:08"} +{"current_steps": 2330, "total_steps": 4476, "loss": 0.3811, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.340431973171978e-05, "epoch": 1.56, "percentage": 52.06, "elapsed_time": "15:06:52", "remaining_time": "13:55:15"} +{"current_steps": 2340, "total_steps": 4476, "loss": 0.3805, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3229249501588278e-05, "epoch": 1.57, "percentage": 52.28, "elapsed_time": "15:10:45", "remaining_time": "13:51:21"} +{"current_steps": 2350, "total_steps": 4476, "loss": 0.3822, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3054266503326165e-05, "epoch": 1.57, "percentage": 52.5, "elapsed_time": "15:14:39", "remaining_time": "13:47:28"} +{"current_steps": 2360, "total_steps": 4476, "loss": 0.3875, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2879379357060345e-05, "epoch": 1.58, "percentage": 52.73, "elapsed_time": "15:18:32", "remaining_time": "13:43:34"} +{"current_steps": 2370, "total_steps": 4476, "loss": 0.3884, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2704596678195827e-05, "epoch": 1.59, "percentage": 52.95, "elapsed_time": "15:22:26", "remaining_time": "13:39:40"} +{"current_steps": 2380, "total_steps": 4476, "loss": 0.381, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2529927076991283e-05, "epoch": 1.59, "percentage": 53.17, "elapsed_time": "15:26:20", "remaining_time": "13:35:47"} 
+{"current_steps": 2390, "total_steps": 4476, "loss": 0.3933, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2355379158134843e-05, "epoch": 1.6, "percentage": 53.4, "elapsed_time": "15:30:12", "remaining_time": "13:31:53"} +{"current_steps": 2400, "total_steps": 4476, "loss": 0.3853, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2180961520320278e-05, "epoch": 1.61, "percentage": 53.62, "elapsed_time": "15:34:04", "remaining_time": "13:27:58"} +{"current_steps": 2410, "total_steps": 4476, "loss": 0.3871, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2006682755823367e-05, "epoch": 1.62, "percentage": 53.84, "elapsed_time": "15:37:58", "remaining_time": "13:24:05"} +{"current_steps": 2420, "total_steps": 4476, "loss": 0.3867, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1832551450078594e-05, "epoch": 1.62, "percentage": 54.07, "elapsed_time": "15:41:51", "remaining_time": "13:20:11"} +{"current_steps": 2430, "total_steps": 4476, "loss": 0.3878, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.165857618125625e-05, "epoch": 1.63, "percentage": 54.29, "elapsed_time": "15:45:45", "remaining_time": "13:16:17"} +{"current_steps": 2440, "total_steps": 4476, "loss": 0.386, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1484765519839843e-05, "epoch": 1.64, "percentage": 54.51, "elapsed_time": "15:49:38", "remaining_time": "13:12:24"} +{"current_steps": 2450, "total_steps": 4476, "loss": 0.3836, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1311128028203863e-05, "epoch": 1.64, "percentage": 54.74, "elapsed_time": "15:53:33", "remaining_time": "13:08:31"} +{"current_steps": 2460, "total_steps": 4476, "loss": 0.3849, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1137672260192004e-05, "epoch": 1.65, "percentage": 54.96, "elapsed_time": "15:57:27", "remaining_time": "13:04:38"} +{"current_steps": 2470, "total_steps": 4476, "loss": 0.3841, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.09644067606958e-05, "epoch": 1.66, "percentage": 55.18, "elapsed_time": "16:01:22", "remaining_time": "13:00:46"} +{"current_steps": 2480, "total_steps": 4476, "loss": 0.3848, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.079134006523359e-05, "epoch": 1.66, "percentage": 55.41, "elapsed_time": "16:05:16", "remaining_time": "12:56:53"} +{"current_steps": 2490, "total_steps": 4476, "loss": 0.3817, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.061848069953017e-05, "epoch": 1.67, "percentage": 55.63, "elapsed_time": "16:09:10", "remaining_time": "12:53:00"} +{"current_steps": 2500, "total_steps": 4476, "loss": 0.3839, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.044583717909667e-05, "epoch": 1.68, "percentage": 55.85, "elapsed_time": "16:13:04", "remaining_time": "12:49:06"} +{"current_steps": 2510, "total_steps": 4476, "loss": 0.3749, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0273418008811125e-05, "epoch": 1.68, "percentage": 56.08, "elapsed_time": "16:16:56", "remaining_time": "12:45:12"} +{"current_steps": 2520, "total_steps": 4476, "loss": 0.3824, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0101231682499506e-05, "epoch": 1.69, "percentage": 56.3, "elapsed_time": "16:20:49", "remaining_time": "12:41:18"} 
+{"current_steps": 2530, "total_steps": 4476, "loss": 0.3854, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9929286682517235e-05, "epoch": 1.7, "percentage": 56.52, "elapsed_time": "16:24:42", "remaining_time": "12:37:24"} +{"current_steps": 2540, "total_steps": 4476, "loss": 0.3894, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9757591479331382e-05, "epoch": 1.7, "percentage": 56.75, "elapsed_time": "16:28:37", "remaining_time": "12:33:31"} +{"current_steps": 2550, "total_steps": 4476, "loss": 0.3871, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9586154531103373e-05, "epoch": 1.71, "percentage": 56.97, "elapsed_time": "16:32:31", "remaining_time": "12:29:38"} +{"current_steps": 2560, "total_steps": 4476, "loss": 0.382, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9414984283272286e-05, "epoch": 1.72, "percentage": 57.19, "elapsed_time": "16:36:23", "remaining_time": "12:25:44"} +{"current_steps": 2570, "total_steps": 4476, "loss": 0.3858, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9244089168138836e-05, "epoch": 1.72, "percentage": 57.42, "elapsed_time": "16:40:16", "remaining_time": "12:21:50"} +{"current_steps": 2580, "total_steps": 4476, "loss": 0.3819, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9073477604449985e-05, "epoch": 1.73, "percentage": 57.64, "elapsed_time": "16:44:09", "remaining_time": "12:17:56"} +{"current_steps": 2590, "total_steps": 4476, "loss": 0.3795, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8903157996984174e-05, "epoch": 1.74, "percentage": 57.86, "elapsed_time": "16:48:03", "remaining_time": "12:14:03"} +{"current_steps": 2600, "total_steps": 4476, "loss": 0.3827, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.873313873613733e-05, "epoch": 1.74, "percentage": 58.09, "elapsed_time": "16:51:57", "remaining_time": "12:10:10"} +{"current_steps": 2610, "total_steps": 4476, "loss": 0.385, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8563428197509502e-05, "epoch": 1.75, "percentage": 58.31, "elapsed_time": "16:55:51", "remaining_time": "12:06:16"} +{"current_steps": 2620, "total_steps": 4476, "loss": 0.3846, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.839403474149225e-05, "epoch": 1.76, "percentage": 58.53, "elapsed_time": "16:59:44", "remaining_time": "12:02:22"} +{"current_steps": 2630, "total_steps": 4476, "loss": 0.3828, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8224966712856806e-05, "epoch": 1.76, "percentage": 58.76, "elapsed_time": "17:03:37", "remaining_time": "11:58:29"} +{"current_steps": 2640, "total_steps": 4476, "loss": 0.3792, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8056232440343013e-05, "epoch": 1.77, "percentage": 58.98, "elapsed_time": "17:07:29", "remaining_time": "11:54:34"} +{"current_steps": 2650, "total_steps": 4476, "loss": 0.3814, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.788784023624896e-05, "epoch": 1.78, "percentage": 59.2, "elapsed_time": "17:11:22", "remaining_time": "11:50:40"} +{"current_steps": 2660, "total_steps": 4476, "loss": 0.3819, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7719798396021558e-05, "epoch": 1.78, "percentage": 59.43, "elapsed_time": "17:15:15", "remaining_time": "11:46:46"} 
+{"current_steps": 2670, "total_steps": 4476, "loss": 0.3798, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7552115197847884e-05, "epoch": 1.79, "percentage": 59.65, "elapsed_time": "17:19:10", "remaining_time": "11:42:53"} +{"current_steps": 2680, "total_steps": 4476, "loss": 0.3772, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7384798902247316e-05, "epoch": 1.8, "percentage": 59.87, "elapsed_time": "17:23:03", "remaining_time": "11:39:00"} +{"current_steps": 2690, "total_steps": 4476, "loss": 0.3758, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7217857751664663e-05, "epoch": 1.8, "percentage": 60.1, "elapsed_time": "17:26:56", "remaining_time": "11:35:06"} +{"current_steps": 2700, "total_steps": 4476, "loss": 0.3706, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7051299970064098e-05, "epoch": 1.81, "percentage": 60.32, "elapsed_time": "17:30:51", "remaining_time": "11:31:13"} +{"current_steps": 2710, "total_steps": 4476, "loss": 0.3805, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6885133762523985e-05, "epoch": 1.82, "percentage": 60.55, "elapsed_time": "17:34:45", "remaining_time": "11:27:20"} +{"current_steps": 2720, "total_steps": 4476, "loss": 0.3892, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6719367314832756e-05, "epoch": 1.82, "percentage": 60.77, "elapsed_time": "17:38:39", "remaining_time": "11:23:27"} +{"current_steps": 2730, "total_steps": 4476, "loss": 0.387, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.65540087930856e-05, "epoch": 1.83, "percentage": 60.99, "elapsed_time": "17:42:33", "remaining_time": "11:19:34"} +{"current_steps": 2740, "total_steps": 4476, "loss": 0.3773, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6389066343282168e-05, "epoch": 1.84, "percentage": 61.22, "elapsed_time": "17:46:26", "remaining_time": "11:15:40"} +{"current_steps": 2750, "total_steps": 4476, "loss": 0.3829, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6224548090925323e-05, "epoch": 1.84, "percentage": 61.44, "elapsed_time": "17:50:20", "remaining_time": "11:11:46"} +{"current_steps": 2760, "total_steps": 4476, "loss": 0.3697, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6060462140620835e-05, "epoch": 1.85, "percentage": 61.66, "elapsed_time": "17:54:11", "remaining_time": "11:07:52"} +{"current_steps": 2770, "total_steps": 4476, "loss": 0.3817, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.589681657567811e-05, "epoch": 1.86, "percentage": 61.89, "elapsed_time": "17:58:04", "remaining_time": "11:03:58"} +{"current_steps": 2780, "total_steps": 4476, "loss": 0.3819, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5733619457712037e-05, "epoch": 1.86, "percentage": 62.11, "elapsed_time": "18:01:58", "remaining_time": "11:00:04"} +{"current_steps": 2790, "total_steps": 4476, "loss": 0.3756, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5570878826245773e-05, "epoch": 1.87, "percentage": 62.33, "elapsed_time": "18:05:52", "remaining_time": "10:56:11"} +{"current_steps": 2800, "total_steps": 4476, "loss": 0.3791, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5408602698314777e-05, "epoch": 1.88, "percentage": 62.56, "elapsed_time": "18:09:44", "remaining_time": "10:52:17"} 
+{"current_steps": 2810, "total_steps": 4476, "loss": 0.3765, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5246799068071818e-05, "epoch": 1.88, "percentage": 62.78, "elapsed_time": "18:13:37", "remaining_time": "10:48:23"} +{"current_steps": 2820, "total_steps": 4476, "loss": 0.3834, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5085475906393153e-05, "epoch": 1.89, "percentage": 63.0, "elapsed_time": "18:17:31", "remaining_time": "10:44:29"} +{"current_steps": 2830, "total_steps": 4476, "loss": 0.381, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4924641160485923e-05, "epoch": 1.9, "percentage": 63.23, "elapsed_time": "18:21:24", "remaining_time": "10:40:36"} +{"current_steps": 2840, "total_steps": 4476, "loss": 0.3766, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4764302753496584e-05, "epoch": 1.9, "percentage": 63.45, "elapsed_time": "18:25:17", "remaining_time": "10:36:42"} +{"current_steps": 2850, "total_steps": 4476, "loss": 0.3815, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4604468584120607e-05, "epoch": 1.91, "percentage": 63.67, "elapsed_time": "18:29:09", "remaining_time": "10:32:48"} +{"current_steps": 2860, "total_steps": 4476, "loss": 0.3774, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4445146526213415e-05, "epoch": 1.92, "percentage": 63.9, "elapsed_time": "18:33:01", "remaining_time": "10:28:53"} +{"current_steps": 2870, "total_steps": 4476, "loss": 0.3879, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4286344428402454e-05, "epoch": 1.92, "percentage": 64.12, "elapsed_time": "18:36:54", "remaining_time": "10:24:59"} +{"current_steps": 2880, "total_steps": 4476, "loss": 0.3777, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.412807011370052e-05, "epoch": 1.93, "percentage": 64.34, "elapsed_time": "18:40:47", "remaining_time": "10:21:06"} +{"current_steps": 2890, "total_steps": 4476, "loss": 0.3806, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3970331379120455e-05, "epoch": 1.94, "percentage": 64.57, "elapsed_time": "18:44:41", "remaining_time": "10:17:12"} +{"current_steps": 2900, "total_steps": 4476, "loss": 0.3848, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3813135995290988e-05, "epoch": 1.94, "percentage": 64.79, "elapsed_time": "18:48:34", "remaining_time": "10:13:19"} +{"current_steps": 2910, "total_steps": 4476, "loss": 0.3745, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3656491706073935e-05, "epoch": 1.95, "percentage": 65.01, "elapsed_time": "18:52:27", "remaining_time": "10:09:25"} +{"current_steps": 2920, "total_steps": 4476, "loss": 0.377, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.350040622818275e-05, "epoch": 1.96, "percentage": 65.24, "elapsed_time": "18:56:21", "remaining_time": "10:05:32"} +{"current_steps": 2930, "total_steps": 4476, "loss": 0.3783, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3344887250802345e-05, "epoch": 1.96, "percentage": 65.46, "elapsed_time": "19:00:14", "remaining_time": "10:01:38"} +{"current_steps": 2940, "total_steps": 4476, "loss": 0.3768, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3189942435210301e-05, "epoch": 1.97, "percentage": 65.68, "elapsed_time": "19:04:08", "remaining_time": "9:57:45"} 
+{"current_steps": 2950, "total_steps": 4476, "loss": 0.3744, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.303557941439949e-05, "epoch": 1.98, "percentage": 65.91, "elapsed_time": "19:08:02", "remaining_time": "9:53:52"} +{"current_steps": 2960, "total_steps": 4476, "loss": 0.3788, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2881805792702031e-05, "epoch": 1.98, "percentage": 66.13, "elapsed_time": "19:11:56", "remaining_time": "9:49:58"} +{"current_steps": 2970, "total_steps": 4476, "loss": 0.3735, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2728629145414645e-05, "epoch": 1.99, "percentage": 66.35, "elapsed_time": "19:15:50", "remaining_time": "9:46:05"} +{"current_steps": 2980, "total_steps": 4476, "loss": 0.3819, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.257605701842554e-05, "epoch": 2.0, "percentage": 66.58, "elapsed_time": "19:19:43", "remaining_time": "9:42:11"} +{"current_steps": 2990, "total_steps": 4476, "loss": 0.3812, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.242409692784265e-05, "epoch": 2.0, "percentage": 66.8, "elapsed_time": "19:23:36", "remaining_time": "9:38:17"} +{"current_steps": 3000, "total_steps": 4476, "loss": 0.3769, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2272756359623342e-05, "epoch": 2.01, "percentage": 67.02, "elapsed_time": "19:27:29", "remaining_time": "9:34:24"} +{"current_steps": 3010, "total_steps": 4476, "loss": 0.3779, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2122042769205702e-05, "epoch": 2.02, "percentage": 67.25, "elapsed_time": "19:31:22", "remaining_time": "9:30:30"} +{"current_steps": 3020, "total_steps": 4476, "loss": 0.3817, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1971963581141196e-05, "epoch": 2.02, "percentage": 67.47, "elapsed_time": "19:35:16", "remaining_time": "9:26:37"} +{"current_steps": 3030, "total_steps": 4476, "loss": 0.3788, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1822526188728966e-05, "epoch": 2.03, "percentage": 67.69, "elapsed_time": "19:39:09", "remaining_time": "9:22:43"} +{"current_steps": 3040, "total_steps": 4476, "loss": 0.3776, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1673737953651601e-05, "epoch": 2.04, "percentage": 67.92, "elapsed_time": "19:43:03", "remaining_time": "9:18:50"} +{"current_steps": 3050, "total_steps": 4476, "loss": 0.3785, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1525606205612447e-05, "epoch": 2.04, "percentage": 68.14, "elapsed_time": "19:46:58", "remaining_time": "9:14:57"} +{"current_steps": 3060, "total_steps": 4476, "loss": 0.3858, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1378138241974595e-05, "epoch": 2.05, "percentage": 68.36, "elapsed_time": "19:50:52", "remaining_time": "9:11:04"} +{"current_steps": 3070, "total_steps": 4476, "loss": 0.3766, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1231341327401323e-05, "epoch": 2.06, "percentage": 68.59, "elapsed_time": "19:54:46", "remaining_time": "9:07:10"} +{"current_steps": 3080, "total_steps": 4476, "loss": 0.3766, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1085222693498256e-05, "epoch": 2.06, "percentage": 68.81, "elapsed_time": "19:58:39", "remaining_time": "9:03:17"} +{"current_steps": 
3090, "total_steps": 4476, "loss": 0.3795, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.093978953845713e-05, "epoch": 2.07, "percentage": 69.03, "elapsed_time": "20:02:32", "remaining_time": "8:59:23"} +{"current_steps": 3100, "total_steps": 4476, "loss": 0.3837, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.079504902670117e-05, "epoch": 2.08, "percentage": 69.26, "elapsed_time": "20:06:25", "remaining_time": "8:55:29"} +{"current_steps": 3110, "total_steps": 4476, "loss": 0.377, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.065100828853213e-05, "epoch": 2.08, "percentage": 69.48, "elapsed_time": "20:10:18", "remaining_time": "8:51:36"} +{"current_steps": 3120, "total_steps": 4476, "loss": 0.3759, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0507674419779085e-05, "epoch": 2.09, "percentage": 69.71, "elapsed_time": "20:14:12", "remaining_time": "8:47:42"} +{"current_steps": 3130, "total_steps": 4476, "loss": 0.3704, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0365054481448849e-05, "epoch": 2.1, "percentage": 69.93, "elapsed_time": "20:18:05", "remaining_time": "8:43:48"} +{"current_steps": 3140, "total_steps": 4476, "loss": 0.3751, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.02231554993781e-05, "epoch": 2.1, "percentage": 70.15, "elapsed_time": "20:21:59", "remaining_time": "8:39:55"} +{"current_steps": 3150, "total_steps": 4476, "loss": 0.396, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0081984463887325e-05, "epoch": 2.11, "percentage": 70.38, "elapsed_time": "20:25:53", "remaining_time": "8:36:02"} +{"current_steps": 3160, "total_steps": 4476, "loss": 0.3788, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.941548329436425e-06, "epoch": 2.12, "percentage": 70.6, "elapsed_time": "20:29:46", "remaining_time": "8:32:08"} +{"current_steps": 3170, "total_steps": 4476, "loss": 0.3767, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.801854014282108e-06, "epoch": 2.12, "percentage": 70.82, "elapsed_time": "20:33:39", "remaining_time": "8:28:15"} +{"current_steps": 3180, "total_steps": 4476, "loss": 0.3783, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.662908400137125e-06, "epoch": 2.13, "percentage": 71.05, "elapsed_time": "20:37:32", "remaining_time": "8:24:21"} +{"current_steps": 3190, "total_steps": 4476, "loss": 0.3775, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.524718331831186e-06, "epoch": 2.14, "percentage": 71.27, "elapsed_time": "20:41:25", "remaining_time": "8:20:27"} +{"current_steps": 3200, "total_steps": 4476, "loss": 0.3789, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.387290616973859e-06, "epoch": 2.14, "percentage": 71.49, "elapsed_time": "20:45:19", "remaining_time": "8:16:34"} +{"current_steps": 3210, "total_steps": 4476, "loss": 0.3776, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.250632025619104e-06, "epoch": 2.15, "percentage": 71.72, "elapsed_time": "20:49:13", "remaining_time": "8:12:41"} +{"current_steps": 3220, "total_steps": 4476, "loss": 0.368, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.11474928993187e-06, "epoch": 2.16, "percentage": 71.94, "elapsed_time": "20:53:06", "remaining_time": "8:08:47"} +{"current_steps": 3230, "total_steps": 4476, 
"loss": 0.378, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.979649103856345e-06, "epoch": 2.16, "percentage": 72.16, "elapsed_time": "20:56:59", "remaining_time": "8:04:53"} +{"current_steps": 3240, "total_steps": 4476, "loss": 0.3776, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.84533812278629e-06, "epoch": 2.17, "percentage": 72.39, "elapsed_time": "21:00:53", "remaining_time": "8:01:00"} +{"current_steps": 3250, "total_steps": 4476, "loss": 0.3731, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.711822963237093e-06, "epoch": 2.18, "percentage": 72.61, "elapsed_time": "21:04:47", "remaining_time": "7:57:07"} +{"current_steps": 3260, "total_steps": 4476, "loss": 0.3827, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.579110202519894e-06, "epoch": 2.18, "percentage": 72.83, "elapsed_time": "21:08:40", "remaining_time": "7:53:13"} +{"current_steps": 3270, "total_steps": 4476, "loss": 0.3725, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.447206378417533e-06, "epoch": 2.19, "percentage": 73.06, "elapsed_time": "21:12:34", "remaining_time": "7:49:19"} +{"current_steps": 3280, "total_steps": 4476, "loss": 0.372, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.31611798886246e-06, "epoch": 2.2, "percentage": 73.28, "elapsed_time": "21:16:27", "remaining_time": "7:45:26"} +{"current_steps": 3290, "total_steps": 4476, "loss": 0.3753, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.185851491616677e-06, "epoch": 2.2, "percentage": 73.5, "elapsed_time": "21:20:21", "remaining_time": "7:41:33"} +{"current_steps": 3300, "total_steps": 4476, "loss": 0.3732, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.0564133039536e-06, "epoch": 2.21, "percentage": 73.73, "elapsed_time": "21:24:16", "remaining_time": "7:37:40"} +{"current_steps": 3310, "total_steps": 4476, "loss": 0.37, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.927809802341876e-06, "epoch": 2.22, "percentage": 73.95, "elapsed_time": "21:28:09", "remaining_time": "7:33:46"} +{"current_steps": 3320, "total_steps": 4476, "loss": 0.372, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.800047322131346e-06, "epoch": 2.22, "percentage": 74.17, "elapsed_time": "21:32:03", "remaining_time": "7:29:53"} +{"current_steps": 3330, "total_steps": 4476, "loss": 0.3734, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.673132157240877e-06, "epoch": 2.23, "percentage": 74.4, "elapsed_time": "21:35:56", "remaining_time": "7:25:59"} +{"current_steps": 3340, "total_steps": 4476, "loss": 0.3734, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.5470705598483405e-06, "epoch": 2.24, "percentage": 74.62, "elapsed_time": "21:39:51", "remaining_time": "7:22:06"} +{"current_steps": 3350, "total_steps": 4476, "loss": 0.3784, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.4218687400826075e-06, "epoch": 2.24, "percentage": 74.84, "elapsed_time": "21:43:45", "remaining_time": "7:18:13"} +{"current_steps": 3360, "total_steps": 4476, "loss": 0.3715, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.297532865717638e-06, "epoch": 2.25, "percentage": 75.07, "elapsed_time": "21:47:40", "remaining_time": "7:14:19"} +{"current_steps": 3370, "total_steps": 4476, "loss": 0.3836, "eval_loss": null, 
"predict_loss": null, "reward": null, "learning_rate": 7.174069061868591e-06, "epoch": 2.26, "percentage": 75.29, "elapsed_time": "21:51:33", "remaining_time": "7:10:26"} +{"current_steps": 3380, "total_steps": 4476, "loss": 0.3784, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.05148341069014e-06, "epoch": 2.27, "percentage": 75.51, "elapsed_time": "21:55:26", "remaining_time": "7:06:32"} +{"current_steps": 3390, "total_steps": 4476, "loss": 0.3737, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.929781951076836e-06, "epoch": 2.27, "percentage": 75.74, "elapsed_time": "21:59:19", "remaining_time": "7:02:39"} +{"current_steps": 3400, "total_steps": 4476, "loss": 0.3827, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.80897067836557e-06, "epoch": 2.28, "percentage": 75.96, "elapsed_time": "22:03:13", "remaining_time": "6:58:45"} +{"current_steps": 3410, "total_steps": 4476, "loss": 0.3808, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.6890555440403015e-06, "epoch": 2.29, "percentage": 76.18, "elapsed_time": "22:07:08", "remaining_time": "6:54:52"} +{"current_steps": 3420, "total_steps": 4476, "loss": 0.3797, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.570042455438822e-06, "epoch": 2.29, "percentage": 76.41, "elapsed_time": "22:11:02", "remaining_time": "6:50:59"} +{"current_steps": 3430, "total_steps": 4476, "loss": 0.3739, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.451937275461736e-06, "epoch": 2.3, "percentage": 76.63, "elapsed_time": "22:14:56", "remaining_time": "6:47:05"} +{"current_steps": 3440, "total_steps": 4476, "loss": 0.3748, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.334745822283699e-06, "epoch": 2.31, "percentage": 76.85, "elapsed_time": "22:18:50", "remaining_time": "6:43:12"} +{"current_steps": 3450, "total_steps": 4476, "loss": 0.3769, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.2184738690667214e-06, "epoch": 2.31, "percentage": 77.08, "elapsed_time": "22:22:43", "remaining_time": "6:39:18"} +{"current_steps": 3460, "total_steps": 4476, "loss": 0.3756, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.103127143675832e-06, "epoch": 2.32, "percentage": 77.3, "elapsed_time": "22:26:37", "remaining_time": "6:35:25"} +{"current_steps": 3470, "total_steps": 4476, "loss": 0.3738, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.988711328396859e-06, "epoch": 2.33, "percentage": 77.52, "elapsed_time": "22:30:30", "remaining_time": "6:31:31"} +{"current_steps": 3480, "total_steps": 4476, "loss": 0.3676, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.875232059656552e-06, "epoch": 2.33, "percentage": 77.75, "elapsed_time": "22:34:23", "remaining_time": "6:27:38"} +{"current_steps": 3490, "total_steps": 4476, "loss": 0.3737, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.762694927744866e-06, "epoch": 2.34, "percentage": 77.97, "elapsed_time": "22:38:16", "remaining_time": "6:23:44"} +{"current_steps": 3500, "total_steps": 4476, "loss": 0.369, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.651105476539623e-06, "epoch": 2.35, "percentage": 78.19, "elapsed_time": "22:42:09", "remaining_time": "6:19:50"} +{"current_steps": 3510, "total_steps": 4476, "loss": 0.3723, "eval_loss": null, "predict_loss": null, 
"reward": null, "learning_rate": 5.540469203233347e-06, "epoch": 2.35, "percentage": 78.42, "elapsed_time": "22:46:03", "remaining_time": "6:15:57"} +{"current_steps": 3520, "total_steps": 4476, "loss": 0.3791, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.430791558062518e-06, "epoch": 2.36, "percentage": 78.64, "elapsed_time": "22:49:57", "remaining_time": "6:12:03"} +{"current_steps": 3530, "total_steps": 4476, "loss": 0.3753, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.322077944039039e-06, "epoch": 2.37, "percentage": 78.87, "elapsed_time": "22:53:50", "remaining_time": "6:08:10"} +{"current_steps": 3540, "total_steps": 4476, "loss": 0.3703, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.21433371668407e-06, "epoch": 2.37, "percentage": 79.09, "elapsed_time": "22:57:43", "remaining_time": "6:04:16"} +{"current_steps": 3550, "total_steps": 4476, "loss": 0.3781, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.107564183764219e-06, "epoch": 2.38, "percentage": 79.31, "elapsed_time": "23:01:37", "remaining_time": "6:00:23"} +{"current_steps": 3560, "total_steps": 4476, "loss": 0.3766, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.001774605030074e-06, "epoch": 2.39, "percentage": 79.54, "elapsed_time": "23:05:30", "remaining_time": "5:56:29"} +{"current_steps": 3570, "total_steps": 4476, "loss": 0.38, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8969701919570454e-06, "epoch": 2.39, "percentage": 79.76, "elapsed_time": "23:09:23", "remaining_time": "5:52:36"} +{"current_steps": 3580, "total_steps": 4476, "loss": 0.3681, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7931561074887e-06, "epoch": 2.4, "percentage": 79.98, "elapsed_time": "23:13:16", "remaining_time": "5:48:42"} +{"current_steps": 3590, "total_steps": 4476, "loss": 0.3752, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.690337465782366e-06, "epoch": 2.41, "percentage": 80.21, "elapsed_time": "23:17:08", "remaining_time": "5:44:48"} +{"current_steps": 3600, "total_steps": 4476, "loss": 0.3775, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.588519331957241e-06, "epoch": 2.41, "percentage": 80.43, "elapsed_time": "23:21:02", "remaining_time": "5:40:55"} +{"current_steps": 3610, "total_steps": 4476, "loss": 0.3677, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4877067218448285e-06, "epoch": 2.42, "percentage": 80.65, "elapsed_time": "23:24:56", "remaining_time": "5:37:01"} +{"current_steps": 3620, "total_steps": 4476, "loss": 0.3718, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.38790460174188e-06, "epoch": 2.43, "percentage": 80.88, "elapsed_time": "23:28:47", "remaining_time": "5:33:07"} +{"current_steps": 3630, "total_steps": 4476, "loss": 0.3671, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.289117888165708e-06, "epoch": 2.43, "percentage": 81.1, "elapsed_time": "23:32:41", "remaining_time": "5:29:14"} +{"current_steps": 3640, "total_steps": 4476, "loss": 0.3728, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.191351447612032e-06, "epoch": 2.44, "percentage": 81.32, "elapsed_time": "23:36:33", "remaining_time": "5:25:20"} +{"current_steps": 3650, "total_steps": 4476, "loss": 0.3769, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 
4.094610096315199e-06, "epoch": 2.45, "percentage": 81.55, "elapsed_time": "23:40:27", "remaining_time": "5:21:27"} +{"current_steps": 3660, "total_steps": 4476, "loss": 0.3777, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.998898600010928e-06, "epoch": 2.45, "percentage": 81.77, "elapsed_time": "23:44:20", "remaining_time": "5:17:33"} +{"current_steps": 3670, "total_steps": 4476, "loss": 0.3817, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.904221673701566e-06, "epoch": 2.46, "percentage": 81.99, "elapsed_time": "23:48:14", "remaining_time": "5:13:40"} +{"current_steps": 3680, "total_steps": 4476, "loss": 0.383, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.810583981423796e-06, "epoch": 2.47, "percentage": 82.22, "elapsed_time": "23:52:07", "remaining_time": "5:09:46"} +{"current_steps": 3690, "total_steps": 4476, "loss": 0.3719, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7179901360188533e-06, "epoch": 2.47, "percentage": 82.44, "elapsed_time": "23:56:01", "remaining_time": "5:05:53"} +{"current_steps": 3700, "total_steps": 4476, "loss": 0.3716, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.626444698905329e-06, "epoch": 2.48, "percentage": 82.66, "elapsed_time": "23:59:55", "remaining_time": "5:01:59"} +{"current_steps": 3710, "total_steps": 4476, "loss": 0.3736, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5359521798544347e-06, "epoch": 2.49, "percentage": 82.89, "elapsed_time": "1 day, 0:03:48", "remaining_time": "4:58:06"} +{"current_steps": 3720, "total_steps": 4476, "loss": 0.3741, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.4465170367678294e-06, "epoch": 2.49, "percentage": 83.11, "elapsed_time": "1 day, 0:07:41", "remaining_time": "4:54:12"} +{"current_steps": 3730, "total_steps": 4476, "loss": 0.3756, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3581436754580363e-06, "epoch": 2.5, "percentage": 83.33, "elapsed_time": "1 day, 0:11:35", "remaining_time": "4:50:19"} +{"current_steps": 3740, "total_steps": 4476, "loss": 0.3777, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.270836449431397e-06, "epoch": 2.51, "percentage": 83.56, "elapsed_time": "1 day, 0:15:28", "remaining_time": "4:46:25"} +{"current_steps": 3750, "total_steps": 4476, "loss": 0.3774, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.184599659673579e-06, "epoch": 2.51, "percentage": 83.78, "elapsed_time": "1 day, 0:19:22", "remaining_time": "4:42:32"} +{"current_steps": 3760, "total_steps": 4476, "loss": 0.3785, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0994375544377424e-06, "epoch": 2.52, "percentage": 84.0, "elapsed_time": "1 day, 0:23:17", "remaining_time": "4:38:38"} +{"current_steps": 3770, "total_steps": 4476, "loss": 0.3768, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0153543290352164e-06, "epoch": 2.53, "percentage": 84.23, "elapsed_time": "1 day, 0:27:11", "remaining_time": "4:34:45"} +{"current_steps": 3780, "total_steps": 4476, "loss": 0.377, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.932354125628853e-06, "epoch": 2.53, "percentage": 84.45, "elapsed_time": "1 day, 0:31:05", "remaining_time": "4:30:52"} +{"current_steps": 3790, "total_steps": 4476, "loss": 0.3803, "eval_loss": null, "predict_loss": null, "reward": 
null, "learning_rate": 2.8504410330289778e-06, "epoch": 2.54, "percentage": 84.67, "elapsed_time": "1 day, 0:34:58", "remaining_time": "4:26:58"} +{"current_steps": 3800, "total_steps": 4476, "loss": 0.3706, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.769619086491923e-06, "epoch": 2.55, "percentage": 84.9, "elapsed_time": "1 day, 0:38:51", "remaining_time": "4:23:04"} +{"current_steps": 3810, "total_steps": 4476, "loss": 0.3712, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6898922675213016e-06, "epoch": 2.55, "percentage": 85.12, "elapsed_time": "1 day, 0:42:45", "remaining_time": "4:19:11"} +{"current_steps": 3820, "total_steps": 4476, "loss": 0.369, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.611264503671823e-06, "epoch": 2.56, "percentage": 85.34, "elapsed_time": "1 day, 0:46:38", "remaining_time": "4:15:17"} +{"current_steps": 3830, "total_steps": 4476, "loss": 0.3726, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.533739668355814e-06, "epoch": 2.57, "percentage": 85.57, "elapsed_time": "1 day, 0:50:31", "remaining_time": "4:11:24"} +{"current_steps": 3840, "total_steps": 4476, "loss": 0.3746, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.45732158065243e-06, "epoch": 2.57, "percentage": 85.79, "elapsed_time": "1 day, 0:54:25", "remaining_time": "4:07:30"} +{"current_steps": 3850, "total_steps": 4476, "loss": 0.3701, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.382014005119501e-06, "epoch": 2.58, "percentage": 86.01, "elapsed_time": "1 day, 0:58:18", "remaining_time": "4:03:37"} +{"current_steps": 3860, "total_steps": 4476, "loss": 0.38, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3078206516080695e-06, "epoch": 2.59, "percentage": 86.24, "elapsed_time": "1 day, 1:02:11", "remaining_time": "3:59:43"} +{"current_steps": 3870, "total_steps": 4476, "loss": 0.3725, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2347451750796474e-06, "epoch": 2.59, "percentage": 86.46, "elapsed_time": "1 day, 1:06:05", "remaining_time": "3:55:50"} +{"current_steps": 3880, "total_steps": 4476, "loss": 0.3825, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1627911754261653e-06, "epoch": 2.6, "percentage": 86.68, "elapsed_time": "1 day, 1:09:58", "remaining_time": "3:51:56"} +{"current_steps": 3890, "total_steps": 4476, "loss": 0.3791, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0919621972926156e-06, "epoch": 2.61, "percentage": 86.91, "elapsed_time": "1 day, 1:13:51", "remaining_time": "3:48:03"} +{"current_steps": 3900, "total_steps": 4476, "loss": 0.3778, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.022261729902458e-06, "epoch": 2.61, "percentage": 87.13, "elapsed_time": "1 day, 1:17:44", "remaining_time": "3:44:09"} +{"current_steps": 3910, "total_steps": 4476, "loss": 0.3735, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.953693206885715e-06, "epoch": 2.62, "percentage": 87.35, "elapsed_time": "1 day, 1:21:38", "remaining_time": "3:40:16"} +{"current_steps": 3920, "total_steps": 4476, "loss": 0.3816, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8862600061098106e-06, "epoch": 2.63, "percentage": 87.58, "elapsed_time": "1 day, 1:25:31", "remaining_time": "3:36:22"} +{"current_steps": 3930, "total_steps": 4476, "loss": 
0.3752, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8199654495131974e-06, "epoch": 2.63, "percentage": 87.8, "elapsed_time": "1 day, 1:29:24", "remaining_time": "3:32:28"} +{"current_steps": 3940, "total_steps": 4476, "loss": 0.3739, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.754812802941691e-06, "epoch": 2.64, "percentage": 88.03, "elapsed_time": "1 day, 1:33:18", "remaining_time": "3:28:35"} +{"current_steps": 3950, "total_steps": 4476, "loss": 0.3745, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6908052759875836e-06, "epoch": 2.65, "percentage": 88.25, "elapsed_time": "1 day, 1:37:12", "remaining_time": "3:24:42"} +{"current_steps": 3960, "total_steps": 4476, "loss": 0.3753, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6279460218315361e-06, "epoch": 2.65, "percentage": 88.47, "elapsed_time": "1 day, 1:41:06", "remaining_time": "3:20:48"} +{"current_steps": 3970, "total_steps": 4476, "loss": 0.3736, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5662381370872532e-06, "epoch": 2.66, "percentage": 88.7, "elapsed_time": "1 day, 1:45:01", "remaining_time": "3:16:55"} +{"current_steps": 3980, "total_steps": 4476, "loss": 0.3755, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5056846616489124e-06, "epoch": 2.67, "percentage": 88.92, "elapsed_time": "1 day, 1:48:56", "remaining_time": "3:13:01"} +{"current_steps": 3990, "total_steps": 4476, "loss": 0.3741, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4462885785414327e-06, "epoch": 2.67, "percentage": 89.14, "elapsed_time": "1 day, 1:52:50", "remaining_time": "3:09:08"} +{"current_steps": 4000, "total_steps": 4476, "loss": 0.3708, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3880528137735132e-06, "epoch": 2.68, "percentage": 89.37, "elapsed_time": "1 day, 1:56:42", "remaining_time": "3:05:14"} +{"current_steps": 4010, "total_steps": 4476, "loss": 0.3703, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3309802361934936e-06, "epoch": 2.69, "percentage": 89.59, "elapsed_time": "1 day, 2:00:35", "remaining_time": "3:01:21"} +{"current_steps": 4020, "total_steps": 4476, "loss": 0.3784, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2750736573480248e-06, "epoch": 2.69, "percentage": 89.81, "elapsed_time": "1 day, 2:04:29", "remaining_time": "2:57:27"} +{"current_steps": 4030, "total_steps": 4476, "loss": 0.3785, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2203358313435609e-06, "epoch": 2.7, "percentage": 90.04, "elapsed_time": "1 day, 2:08:23", "remaining_time": "2:53:34"} +{"current_steps": 4040, "total_steps": 4476, "loss": 0.3832, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1667694547106978e-06, "epoch": 2.71, "percentage": 90.26, "elapsed_time": "1 day, 2:12:16", "remaining_time": "2:49:40"} +{"current_steps": 4050, "total_steps": 4476, "loss": 0.3708, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1143771662713214e-06, "epoch": 2.71, "percentage": 90.48, "elapsed_time": "1 day, 2:16:10", "remaining_time": "2:45:47"} +{"current_steps": 4060, "total_steps": 4476, "loss": 0.3777, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.063161547008612e-06, "epoch": 2.72, "percentage": 90.71, "elapsed_time": "1 day, 2:20:02", 
"remaining_time": "2:41:53"} +{"current_steps": 4070, "total_steps": 4476, "loss": 0.375, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0131251199399089e-06, "epoch": 2.73, "percentage": 90.93, "elapsed_time": "1 day, 2:23:56", "remaining_time": "2:38:00"} +{"current_steps": 4080, "total_steps": 4476, "loss": 0.3719, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.642703499924216e-07, "epoch": 2.73, "percentage": 91.15, "elapsed_time": "1 day, 2:27:50", "remaining_time": "2:34:06"} +{"current_steps": 4090, "total_steps": 4476, "loss": 0.3776, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.16599643881777e-07, "epoch": 2.74, "percentage": 91.38, "elapsed_time": "1 day, 2:31:44", "remaining_time": "2:30:13"} +{"current_steps": 4100, "total_steps": 4476, "loss": 0.377, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.701153499934833e-07, "epoch": 2.75, "percentage": 91.6, "elapsed_time": "1 day, 2:35:37", "remaining_time": "2:26:19"} +{"current_steps": 4110, "total_steps": 4476, "loss": 0.3759, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.248197582672395e-07, "epoch": 2.75, "percentage": 91.82, "elapsed_time": "1 day, 2:39:30", "remaining_time": "2:22:26"} +{"current_steps": 4120, "total_steps": 4476, "loss": 0.3727, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.807151000841118e-07, "epoch": 2.76, "percentage": 92.05, "elapsed_time": "1 day, 2:43:24", "remaining_time": "2:18:32"} +{"current_steps": 4130, "total_steps": 4476, "loss": 0.374, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.378035481566181e-07, "epoch": 2.77, "percentage": 92.27, "elapsed_time": "1 day, 2:47:17", "remaining_time": "2:14:39"} +{"current_steps": 4140, "total_steps": 4476, "loss": 0.3792, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.960872164217064e-07, "epoch": 2.77, "percentage": 92.49, "elapsed_time": "1 day, 2:51:11", "remaining_time": "2:10:45"} +{"current_steps": 4150, "total_steps": 4476, "loss": 0.3692, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.555681599365926e-07, "epoch": 2.78, "percentage": 92.72, "elapsed_time": "1 day, 2:55:04", "remaining_time": "2:06:52"} +{"current_steps": 4160, "total_steps": 4476, "loss": 0.3736, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.16248374777545e-07, "epoch": 2.79, "percentage": 92.94, "elapsed_time": "1 day, 2:58:57", "remaining_time": "2:02:58"} +{"current_steps": 4170, "total_steps": 4476, "loss": 0.3695, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.781297979415456e-07, "epoch": 2.79, "percentage": 93.16, "elapsed_time": "1 day, 3:02:51", "remaining_time": "1:59:05"} +{"current_steps": 4180, "total_steps": 4476, "loss": 0.3716, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.412143072508563e-07, "epoch": 2.8, "percentage": 93.39, "elapsed_time": "1 day, 3:06:44", "remaining_time": "1:55:11"} +{"current_steps": 4190, "total_steps": 4476, "loss": 0.364, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.055037212605279e-07, "epoch": 2.81, "percentage": 93.61, "elapsed_time": "1 day, 3:10:38", "remaining_time": "1:51:18"} +{"current_steps": 4200, "total_steps": 4476, "loss": 0.3707, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.709997991688114e-07, "epoch": 2.81, 
"percentage": 93.83, "elapsed_time": "1 day, 3:14:33", "remaining_time": "1:47:24"} +{"current_steps": 4210, "total_steps": 4476, "loss": 0.3833, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.377042407304827e-07, "epoch": 2.82, "percentage": 94.06, "elapsed_time": "1 day, 3:18:27", "remaining_time": "1:43:31"} +{"current_steps": 4220, "total_steps": 4476, "loss": 0.3791, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.0561868617312316e-07, "epoch": 2.83, "percentage": 94.28, "elapsed_time": "1 day, 3:22:21", "remaining_time": "1:39:37"} +{"current_steps": 4230, "total_steps": 4476, "loss": 0.3743, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.747447161163126e-07, "epoch": 2.83, "percentage": 94.5, "elapsed_time": "1 day, 3:26:14", "remaining_time": "1:35:44"} +{"current_steps": 4240, "total_steps": 4476, "loss": 0.3759, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.4508385149375764e-07, "epoch": 2.84, "percentage": 94.73, "elapsed_time": "1 day, 3:30:08", "remaining_time": "1:31:50"} +{"current_steps": 4250, "total_steps": 4476, "loss": 0.3667, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.166375534783717e-07, "epoch": 2.85, "percentage": 94.95, "elapsed_time": "1 day, 3:34:00", "remaining_time": "1:27:57"} +{"current_steps": 4260, "total_steps": 4476, "loss": 0.3725, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8940722341030126e-07, "epoch": 2.85, "percentage": 95.17, "elapsed_time": "1 day, 3:37:53", "remaining_time": "1:24:03"} +{"current_steps": 4270, "total_steps": 4476, "loss": 0.3733, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6339420272787074e-07, "epoch": 2.86, "percentage": 95.4, "elapsed_time": "1 day, 3:41:47", "remaining_time": "1:20:10"} +{"current_steps": 4280, "total_steps": 4476, "loss": 0.373, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3859977290152935e-07, "epoch": 2.87, "percentage": 95.62, "elapsed_time": "1 day, 3:45:41", "remaining_time": "1:16:16"} +{"current_steps": 4290, "total_steps": 4476, "loss": 0.3725, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1502515537069334e-07, "epoch": 2.87, "percentage": 95.84, "elapsed_time": "1 day, 3:49:34", "remaining_time": "1:12:23"} +{"current_steps": 4300, "total_steps": 4476, "loss": 0.3729, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.926715114835914e-07, "epoch": 2.88, "percentage": 96.07, "elapsed_time": "1 day, 3:53:27", "remaining_time": "1:08:29"} +{"current_steps": 4310, "total_steps": 4476, "loss": 0.3742, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7153994244005766e-07, "epoch": 2.89, "percentage": 96.29, "elapsed_time": "1 day, 3:57:22", "remaining_time": "1:04:36"} +{"current_steps": 4320, "total_steps": 4476, "loss": 0.3739, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.516314892372639e-07, "epoch": 2.89, "percentage": 96.51, "elapsed_time": "1 day, 4:01:15", "remaining_time": "1:00:42"} +{"current_steps": 4330, "total_steps": 4476, "loss": 0.3755, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3294713261845503e-07, "epoch": 2.9, "percentage": 96.74, "elapsed_time": "1 day, 4:05:09", "remaining_time": "0:56:49"} +{"current_steps": 4340, "total_steps": 4476, "loss": 0.3702, "eval_loss": null, "predict_loss": null, "reward": 
null, "learning_rate": 1.1548779302463231e-07, "epoch": 2.91, "percentage": 96.96, "elapsed_time": "1 day, 4:09:01", "remaining_time": "0:52:55"} +{"current_steps": 4350, "total_steps": 4476, "loss": 0.3761, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.92543305492033e-08, "epoch": 2.92, "percentage": 97.18, "elapsed_time": "1 day, 4:12:53", "remaining_time": "0:49:02"} +{"current_steps": 4360, "total_steps": 4476, "loss": 0.38, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.424754489561038e-08, "epoch": 2.92, "percentage": 97.41, "elapsed_time": "1 day, 4:16:48", "remaining_time": "0:45:08"} +{"current_steps": 4370, "total_steps": 4476, "loss": 0.3737, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.046817533795102e-08, "epoch": 2.93, "percentage": 97.63, "elapsed_time": "1 day, 4:20:42", "remaining_time": "0:41:15"} +{"current_steps": 4380, "total_steps": 4476, "loss": 0.3722, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.7916900684540366e-08, "epoch": 2.94, "percentage": 97.86, "elapsed_time": "1 day, 4:24:35", "remaining_time": "0:37:21"} +{"current_steps": 4390, "total_steps": 4476, "loss": 0.3805, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6594339244479536e-08, "epoch": 2.94, "percentage": 98.08, "elapsed_time": "1 day, 4:28:28", "remaining_time": "0:33:28"} +{"current_steps": 4400, "total_steps": 4476, "loss": 0.3736, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.650104879719951e-08, "epoch": 2.95, "percentage": 98.3, "elapsed_time": "1 day, 4:32:22", "remaining_time": "0:29:34"} +{"current_steps": 4410, "total_steps": 4476, "loss": 0.37, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7637526564971982e-08, "epoch": 2.96, "percentage": 98.53, "elapsed_time": "1 day, 4:36:16", "remaining_time": "0:25:41"} +{"current_steps": 4420, "total_steps": 4476, "loss": 0.374, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0004209188428937e-08, "epoch": 2.96, "percentage": 98.75, "elapsed_time": "1 day, 4:40:09", "remaining_time": "0:21:47"} +{"current_steps": 4430, "total_steps": 4476, "loss": 0.3713, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3601472705046525e-08, "epoch": 2.97, "percentage": 98.97, "elapsed_time": "1 day, 4:44:03", "remaining_time": "0:17:54"} +{"current_steps": 4440, "total_steps": 4476, "loss": 0.3686, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.429632530618236e-09, "epoch": 2.98, "percentage": 99.2, "elapsed_time": "1 day, 4:47:58", "remaining_time": "0:14:00"} +{"current_steps": 4450, "total_steps": 4476, "loss": 0.3721, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.488943443711757e-09, "epoch": 2.98, "percentage": 99.42, "elapsed_time": "1 day, 4:51:52", "remaining_time": "0:10:07"} +{"current_steps": 4460, "total_steps": 4476, "loss": 0.3766, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.779599573137336e-09, "epoch": 2.99, "percentage": 99.64, "elapsed_time": "1 day, 4:55:46", "remaining_time": "0:06:13"} +{"current_steps": 4470, "total_steps": 4476, "loss": 0.376, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.017343883637835e-10, "epoch": 3.0, "percentage": 99.87, "elapsed_time": "1 day, 4:59:39", "remaining_time": "0:02:20"} +{"current_steps": 4476, "total_steps": 4476, "loss": null, 
"eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "1 day, 5:01:59", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4042a6cb1ede5a28d323db367ee58f0bb4dabe53 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,2710 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9994974032501256, + "eval_steps": 500, + "global_step": 4476, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.99993842168232e-05, + "loss": 1.2211, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.9997536897627915e-05, + "loss": 1.0276, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 4.9994458133418e-05, + "loss": 0.8587, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.999014807586154e-05, + "loss": 0.7431, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.9984606937283405e-05, + "loss": 0.6841, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 4.9977834990654804e-05, + "loss": 0.6452, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 4.99698325695798e-05, + "loss": 0.6347, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.9960600068278876e-05, + "loss": 0.6109, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.995013794156957e-05, + "loss": 0.5911, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 4.993844670484401e-05, + "loss": 0.5803, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 4.992552693404354e-05, + "loss": 0.5902, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 4.991137926563036e-05, + "loss": 0.5745, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 4.9896004396556176e-05, + "loss": 0.5538, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 4.987940308422783e-05, + "loss": 0.5495, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 4.986157614647005e-05, + "loss": 0.5433, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 4.984252446148508e-05, + "loss": 0.548, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 4.98222489678095e-05, + "loss": 0.5361, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 4.980075066426796e-05, + "loss": 0.5331, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 4.977803060992393e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 4.97540899240276e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 4.972892978596069e-05, + "loss": 0.5101, + "step": 210 + }, + { + "epoch": 0.15, + "learning_rate": 4.970255143517838e-05, + "loss": 0.5125, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 4.967495617114826e-05, + "loss": 0.4928, + "step": 230 + }, + { + "epoch": 0.16, + "learning_rate": 4.964614535328626e-05, + "loss": 0.4878, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 4.961612040088973e-05, + "loss": 0.5017, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 4.9584882793067534e-05, + "loss": 0.4863, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 4.955243406866713e-05, + "loss": 0.4847, + "step": 270 + }, + { + "epoch": 0.19, + "learning_rate": 4.951877582619881e-05, + "loss": 0.4868, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 4.948390972375694e-05, + "loss": 0.4748, + "step": 290 + }, + { + "epoch": 
0.2, + "learning_rate": 4.944783747893825e-05, + "loss": 0.4764, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 4.941056086875727e-05, + "loss": 0.4712, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 4.937208172955876e-05, + "loss": 0.4642, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 4.9332401956927224e-05, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.23, + "learning_rate": 4.9291523505593604e-05, + "loss": 0.4709, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 4.9249448389338905e-05, + "loss": 0.461, + "step": 350 + }, + { + "epoch": 0.24, + "learning_rate": 4.920617868089501e-05, + "loss": 0.4677, + "step": 360 + }, + { + "epoch": 0.25, + "learning_rate": 4.9161716511842614e-05, + "loss": 0.4564, + "step": 370 + }, + { + "epoch": 0.25, + "learning_rate": 4.911606407250617e-05, + "loss": 0.4663, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 4.9069223611846014e-05, + "loss": 0.4682, + "step": 390 + }, + { + "epoch": 0.27, + "learning_rate": 4.9021197437347555e-05, + "loss": 0.4636, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 4.897198791490762e-05, + "loss": 0.4569, + "step": 410 + }, + { + "epoch": 0.28, + "learning_rate": 4.8921597468717887e-05, + "loss": 0.462, + "step": 420 + }, + { + "epoch": 0.29, + "learning_rate": 4.887002858114548e-05, + "loss": 0.4563, + "step": 430 + }, + { + "epoch": 0.29, + "learning_rate": 4.881728379261068e-05, + "loss": 0.4563, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 4.876336570146175e-05, + "loss": 0.4468, + "step": 450 + }, + { + "epoch": 0.31, + "learning_rate": 4.870827696384698e-05, + "loss": 0.4508, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 4.865202029358379e-05, + "loss": 0.4507, + "step": 470 + }, + { + "epoch": 0.32, + "learning_rate": 4.859459846202507e-05, + "loss": 0.4486, + "step": 480 + }, + { + "epoch": 0.33, + "learning_rate": 4.853601429792265e-05, + "loss": 0.4423, + "step": 490 + }, + { + "epoch": 0.34, + "learning_rate": 4.847627068728795e-05, + "loss": 0.4369, + "step": 500 + }, + { + "epoch": 0.34, + "learning_rate": 4.841537057324979e-05, + "loss": 0.4429, + "step": 510 + }, + { + "epoch": 0.35, + "learning_rate": 4.835331695590943e-05, + "loss": 0.4389, + "step": 520 + }, + { + "epoch": 0.36, + "learning_rate": 4.829011289219276e-05, + "loss": 0.44, + "step": 530 + }, + { + "epoch": 0.36, + "learning_rate": 4.82257614956997e-05, + "loss": 0.4476, + "step": 540 + }, + { + "epoch": 0.37, + "learning_rate": 4.816026593655085e-05, + "loss": 0.4367, + "step": 550 + }, + { + "epoch": 0.38, + "learning_rate": 4.809362944123129e-05, + "loss": 0.4357, + "step": 560 + }, + { + "epoch": 0.38, + "learning_rate": 4.802585529243164e-05, + "loss": 0.4492, + "step": 570 + }, + { + "epoch": 0.39, + "learning_rate": 4.795694682888635e-05, + "loss": 0.4403, + "step": 580 + }, + { + "epoch": 0.4, + "learning_rate": 4.7886907445209234e-05, + "loss": 0.4406, + "step": 590 + }, + { + "epoch": 0.4, + "learning_rate": 4.781574059172621e-05, + "loss": 0.4317, + "step": 600 + }, + { + "epoch": 0.41, + "learning_rate": 4.7743449774305386e-05, + "loss": 0.4379, + "step": 610 + }, + { + "epoch": 0.42, + "learning_rate": 4.7670038554184296e-05, + "loss": 0.4324, + "step": 620 + }, + { + "epoch": 0.42, + "learning_rate": 4.7595510547794465e-05, + "loss": 0.4329, + "step": 630 + }, + { + "epoch": 0.43, + "learning_rate": 4.751986942658332e-05, + "loss": 0.4259, + "step": 640 + }, + { + "epoch": 0.44, + "learning_rate": 4.744311891683325e-05, + 
"loss": 0.4256, + "step": 650 + }, + { + "epoch": 0.44, + "learning_rate": 4.736526279947807e-05, + "loss": 0.4289, + "step": 660 + }, + { + "epoch": 0.45, + "learning_rate": 4.728630490991676e-05, + "loss": 0.4353, + "step": 670 + }, + { + "epoch": 0.46, + "learning_rate": 4.7206249137824535e-05, + "loss": 0.4413, + "step": 680 + }, + { + "epoch": 0.46, + "learning_rate": 4.7125099426961185e-05, + "loss": 0.4302, + "step": 690 + }, + { + "epoch": 0.47, + "learning_rate": 4.704285977497687e-05, + "loss": 0.4365, + "step": 700 + }, + { + "epoch": 0.48, + "learning_rate": 4.6959534233215116e-05, + "loss": 0.4238, + "step": 710 + }, + { + "epoch": 0.48, + "learning_rate": 4.687512690651328e-05, + "loss": 0.4284, + "step": 720 + }, + { + "epoch": 0.49, + "learning_rate": 4.678964195300028e-05, + "loss": 0.4193, + "step": 730 + }, + { + "epoch": 0.5, + "learning_rate": 4.670308358389184e-05, + "loss": 0.4256, + "step": 740 + }, + { + "epoch": 0.5, + "learning_rate": 4.6615456063282944e-05, + "loss": 0.4288, + "step": 750 + }, + { + "epoch": 0.51, + "learning_rate": 4.652676370793784e-05, + "loss": 0.4335, + "step": 760 + }, + { + "epoch": 0.52, + "learning_rate": 4.643701088707736e-05, + "loss": 0.4271, + "step": 770 + }, + { + "epoch": 0.52, + "learning_rate": 4.634620202216366e-05, + "loss": 0.4304, + "step": 780 + }, + { + "epoch": 0.53, + "learning_rate": 4.625434158668246e-05, + "loss": 0.4249, + "step": 790 + }, + { + "epoch": 0.54, + "learning_rate": 4.6161434105922616e-05, + "loss": 0.4322, + "step": 800 + }, + { + "epoch": 0.54, + "learning_rate": 4.6067484156753234e-05, + "loss": 0.4229, + "step": 810 + }, + { + "epoch": 0.55, + "learning_rate": 4.597249636739815e-05, + "loss": 0.4252, + "step": 820 + }, + { + "epoch": 0.56, + "learning_rate": 4.5876475417207974e-05, + "loss": 0.413, + "step": 830 + }, + { + "epoch": 0.56, + "learning_rate": 4.577942603642959e-05, + "loss": 0.4186, + "step": 840 + }, + { + "epoch": 0.57, + "learning_rate": 4.568135300597306e-05, + "loss": 0.4233, + "step": 850 + }, + { + "epoch": 0.58, + "learning_rate": 4.5582261157176164e-05, + "loss": 0.4177, + "step": 860 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482155371566384e-05, + "loss": 0.4236, + "step": 870 + }, + { + "epoch": 0.59, + "learning_rate": 4.538104058062042e-05, + "loss": 0.4228, + "step": 880 + }, + { + "epoch": 0.6, + "learning_rate": 4.5278921765521234e-05, + "loss": 0.4181, + "step": 890 + }, + { + "epoch": 0.6, + "learning_rate": 4.51758039569127e-05, + "loss": 0.4261, + "step": 900 + }, + { + "epoch": 0.61, + "learning_rate": 4.5071692234651764e-05, + "loss": 0.4217, + "step": 910 + }, + { + "epoch": 0.62, + "learning_rate": 4.4966591727558184e-05, + "loss": 0.4191, + "step": 920 + }, + { + "epoch": 0.62, + "learning_rate": 4.48605076131619e-05, + "loss": 0.4247, + "step": 930 + }, + { + "epoch": 0.63, + "learning_rate": 4.475344511744794e-05, + "loss": 0.4236, + "step": 940 + }, + { + "epoch": 0.64, + "learning_rate": 4.464540951459902e-05, + "loss": 0.4172, + "step": 950 + }, + { + "epoch": 0.64, + "learning_rate": 4.4536406126735664e-05, + "loss": 0.4209, + "step": 960 + }, + { + "epoch": 0.65, + "learning_rate": 4.442644032365407e-05, + "loss": 0.4179, + "step": 970 + }, + { + "epoch": 0.66, + "learning_rate": 4.431551752256155e-05, + "loss": 0.4166, + "step": 980 + }, + { + "epoch": 0.66, + "learning_rate": 4.420364318780973e-05, + "loss": 0.4173, + "step": 990 + }, + { + "epoch": 0.67, + "learning_rate": 4.4090822830625236e-05, + "loss": 0.4166, + "step": 1000 + }, + { + 
"epoch": 0.68, + "learning_rate": 4.3977062008838307e-05, + "loss": 0.4173, + "step": 1010 + }, + { + "epoch": 0.68, + "learning_rate": 4.3862366326608975e-05, + "loss": 0.4049, + "step": 1020 + }, + { + "epoch": 0.69, + "learning_rate": 4.374674143415096e-05, + "loss": 0.4143, + "step": 1030 + }, + { + "epoch": 0.7, + "learning_rate": 4.363019302745334e-05, + "loss": 0.4219, + "step": 1040 + }, + { + "epoch": 0.7, + "learning_rate": 4.3512726847999987e-05, + "loss": 0.4152, + "step": 1050 + }, + { + "epoch": 0.71, + "learning_rate": 4.339434868248665e-05, + "loss": 0.4153, + "step": 1060 + }, + { + "epoch": 0.72, + "learning_rate": 4.3275064362535966e-05, + "loss": 0.4148, + "step": 1070 + }, + { + "epoch": 0.72, + "learning_rate": 4.315487976441014e-05, + "loss": 0.4147, + "step": 1080 + }, + { + "epoch": 0.73, + "learning_rate": 4.303380080872145e-05, + "loss": 0.41, + "step": 1090 + }, + { + "epoch": 0.74, + "learning_rate": 4.291183346014063e-05, + "loss": 0.4119, + "step": 1100 + }, + { + "epoch": 0.74, + "learning_rate": 4.278898372710296e-05, + "loss": 0.4173, + "step": 1110 + }, + { + "epoch": 0.75, + "learning_rate": 4.266525766151238e-05, + "loss": 0.4119, + "step": 1120 + }, + { + "epoch": 0.76, + "learning_rate": 4.254066135844326e-05, + "loss": 0.4163, + "step": 1130 + }, + { + "epoch": 0.76, + "learning_rate": 4.2415200955840184e-05, + "loss": 0.4104, + "step": 1140 + }, + { + "epoch": 0.77, + "learning_rate": 4.228888263421557e-05, + "loss": 0.4045, + "step": 1150 + }, + { + "epoch": 0.78, + "learning_rate": 4.216171261634521e-05, + "loss": 0.413, + "step": 1160 + }, + { + "epoch": 0.78, + "learning_rate": 4.2033697166961716e-05, + "loss": 0.4112, + "step": 1170 + }, + { + "epoch": 0.79, + "learning_rate": 4.1904842592445906e-05, + "loss": 0.4018, + "step": 1180 + }, + { + "epoch": 0.8, + "learning_rate": 4.177515524051609e-05, + "loss": 0.4068, + "step": 1190 + }, + { + "epoch": 0.8, + "learning_rate": 4.1644641499915454e-05, + "loss": 0.4029, + "step": 1200 + }, + { + "epoch": 0.81, + "learning_rate": 4.151330780009726e-05, + "loss": 0.4009, + "step": 1210 + }, + { + "epoch": 0.82, + "learning_rate": 4.1381160610908134e-05, + "loss": 0.4073, + "step": 1220 + }, + { + "epoch": 0.82, + "learning_rate": 4.124820644226936e-05, + "loss": 0.4138, + "step": 1230 + }, + { + "epoch": 0.83, + "learning_rate": 4.111445184385616e-05, + "loss": 0.4139, + "step": 1240 + }, + { + "epoch": 0.84, + "learning_rate": 4.097990340477507e-05, + "loss": 0.4062, + "step": 1250 + }, + { + "epoch": 0.84, + "learning_rate": 4.0844567753239276e-05, + "loss": 0.4044, + "step": 1260 + }, + { + "epoch": 0.85, + "learning_rate": 4.070845155624221e-05, + "loss": 0.3978, + "step": 1270 + }, + { + "epoch": 0.86, + "learning_rate": 4.0571561519228984e-05, + "loss": 0.4102, + "step": 1280 + }, + { + "epoch": 0.86, + "learning_rate": 4.043390438576616e-05, + "loss": 0.4052, + "step": 1290 + }, + { + "epoch": 0.87, + "learning_rate": 4.029548693720949e-05, + "loss": 0.4048, + "step": 1300 + }, + { + "epoch": 0.88, + "learning_rate": 4.0156315992369864e-05, + "loss": 0.4008, + "step": 1310 + }, + { + "epoch": 0.88, + "learning_rate": 4.001639840717741e-05, + "loss": 0.4038, + "step": 1320 + }, + { + "epoch": 0.89, + "learning_rate": 3.9875741074343744e-05, + "loss": 0.408, + "step": 1330 + }, + { + "epoch": 0.9, + "learning_rate": 3.973435092302239e-05, + "loss": 0.406, + "step": 1340 + }, + { + "epoch": 0.9, + "learning_rate": 3.959223491846749e-05, + "loss": 0.3991, + "step": 1350 + }, + { + "epoch": 0.91, + 
"learning_rate": 3.94494000616906e-05, + "loss": 0.4091, + "step": 1360 + }, + { + "epoch": 0.92, + "learning_rate": 3.93058533891159e-05, + "loss": 0.4, + "step": 1370 + }, + { + "epoch": 0.92, + "learning_rate": 3.916160197223344e-05, + "loss": 0.4112, + "step": 1380 + }, + { + "epoch": 0.93, + "learning_rate": 3.901665291725091e-05, + "loss": 0.4024, + "step": 1390 + }, + { + "epoch": 0.94, + "learning_rate": 3.887101336474346e-05, + "loss": 0.4048, + "step": 1400 + }, + { + "epoch": 0.94, + "learning_rate": 3.8724690489302004e-05, + "loss": 0.4112, + "step": 1410 + }, + { + "epoch": 0.95, + "learning_rate": 3.857769149917973e-05, + "loss": 0.3947, + "step": 1420 + }, + { + "epoch": 0.96, + "learning_rate": 3.843002363593707e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.96, + "learning_rate": 3.828169417408488e-05, + "loss": 0.3976, + "step": 1440 + }, + { + "epoch": 0.97, + "learning_rate": 3.8132710420726146e-05, + "loss": 0.4006, + "step": 1450 + }, + { + "epoch": 0.98, + "learning_rate": 3.7983079715195984e-05, + "loss": 0.398, + "step": 1460 + }, + { + "epoch": 0.99, + "learning_rate": 3.78328094287001e-05, + "loss": 0.3987, + "step": 1470 + }, + { + "epoch": 0.99, + "learning_rate": 3.768190696395162e-05, + "loss": 0.4013, + "step": 1480 + }, + { + "epoch": 1.0, + "learning_rate": 3.7530379754806494e-05, + "loss": 0.4028, + "step": 1490 + }, + { + "epoch": 1.01, + "learning_rate": 3.737823526589722e-05, + "loss": 0.4036, + "step": 1500 + }, + { + "epoch": 1.01, + "learning_rate": 3.7225480992265125e-05, + "loss": 0.3937, + "step": 1510 + }, + { + "epoch": 1.02, + "learning_rate": 3.707212445899116e-05, + "loss": 0.4007, + "step": 1520 + }, + { + "epoch": 1.03, + "learning_rate": 3.6918173220825204e-05, + "loss": 0.4004, + "step": 1530 + }, + { + "epoch": 1.03, + "learning_rate": 3.6763634861813836e-05, + "loss": 0.4004, + "step": 1540 + }, + { + "epoch": 1.04, + "learning_rate": 3.660851699492679e-05, + "loss": 0.3991, + "step": 1550 + }, + { + "epoch": 1.05, + "learning_rate": 3.645282726168191e-05, + "loss": 0.4042, + "step": 1560 + }, + { + "epoch": 1.05, + "learning_rate": 3.6296573331768664e-05, + "loss": 0.4043, + "step": 1570 + }, + { + "epoch": 1.06, + "learning_rate": 3.613976290267036e-05, + "loss": 0.3948, + "step": 1580 + }, + { + "epoch": 1.07, + "learning_rate": 3.598240369928494e-05, + "loss": 0.3952, + "step": 1590 + }, + { + "epoch": 1.07, + "learning_rate": 3.5824503473544405e-05, + "loss": 0.4002, + "step": 1600 + }, + { + "epoch": 1.08, + "learning_rate": 3.566607000403298e-05, + "loss": 0.4079, + "step": 1610 + }, + { + "epoch": 1.09, + "learning_rate": 3.5523030408223166e-05, + "loss": 0.3942, + "step": 1620 + }, + { + "epoch": 1.09, + "learning_rate": 3.5363605299319165e-05, + "loss": 0.3935, + "step": 1630 + }, + { + "epoch": 1.1, + "learning_rate": 3.520366965171161e-05, + "loss": 0.3898, + "step": 1640 + }, + { + "epoch": 1.11, + "learning_rate": 3.504323134425501e-05, + "loss": 0.4006, + "step": 1650 + }, + { + "epoch": 1.11, + "learning_rate": 3.48822982805662e-05, + "loss": 0.4089, + "step": 1660 + }, + { + "epoch": 1.12, + "learning_rate": 3.472087838863505e-05, + "loss": 0.3982, + "step": 1670 + }, + { + "epoch": 1.13, + "learning_rate": 3.455897962043387e-05, + "loss": 0.399, + "step": 1680 + }, + { + "epoch": 1.13, + "learning_rate": 3.4396609951525676e-05, + "loss": 0.3964, + "step": 1690 + }, + { + "epoch": 1.14, + "learning_rate": 3.423377738067132e-05, + "loss": 0.3909, + "step": 1700 + }, + { + "epoch": 1.15, + "learning_rate": 
3.407048992943541e-05, + "loss": 0.4015, + "step": 1710 + }, + { + "epoch": 1.15, + "learning_rate": 3.39067556417912e-05, + "loss": 0.3915, + "step": 1720 + }, + { + "epoch": 1.16, + "learning_rate": 3.374258258372426e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.17, + "learning_rate": 3.357797884283517e-05, + "loss": 0.4018, + "step": 1740 + }, + { + "epoch": 1.17, + "learning_rate": 3.3412952527941096e-05, + "loss": 0.3914, + "step": 1750 + }, + { + "epoch": 1.18, + "learning_rate": 3.32475117686763e-05, + "loss": 0.3909, + "step": 1760 + }, + { + "epoch": 1.19, + "learning_rate": 3.308166471509171e-05, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 1.19, + "learning_rate": 3.2915419537253346e-05, + "loss": 0.3906, + "step": 1780 + }, + { + "epoch": 1.2, + "learning_rate": 3.274878442483991e-05, + "loss": 0.3897, + "step": 1790 + }, + { + "epoch": 1.21, + "learning_rate": 3.258176758673932e-05, + "loss": 0.3954, + "step": 1800 + }, + { + "epoch": 1.21, + "learning_rate": 3.241437725064431e-05, + "loss": 0.3835, + "step": 1810 + }, + { + "epoch": 1.22, + "learning_rate": 3.224662166264711e-05, + "loss": 0.3854, + "step": 1820 + }, + { + "epoch": 1.23, + "learning_rate": 3.207850908683322e-05, + "loss": 0.3924, + "step": 1830 + }, + { + "epoch": 1.23, + "learning_rate": 3.191004780487434e-05, + "loss": 0.3888, + "step": 1840 + }, + { + "epoch": 1.24, + "learning_rate": 3.1741246115620336e-05, + "loss": 0.3914, + "step": 1850 + }, + { + "epoch": 1.25, + "learning_rate": 3.157211233469042e-05, + "loss": 0.391, + "step": 1860 + }, + { + "epoch": 1.25, + "learning_rate": 3.140265479406358e-05, + "loss": 0.3916, + "step": 1870 + }, + { + "epoch": 1.26, + "learning_rate": 3.1232881841668015e-05, + "loss": 0.4012, + "step": 1880 + }, + { + "epoch": 1.27, + "learning_rate": 3.106280184096996e-05, + "loss": 0.3934, + "step": 1890 + }, + { + "epoch": 1.27, + "learning_rate": 3.089242317056168e-05, + "loss": 0.3908, + "step": 1900 + }, + { + "epoch": 1.28, + "learning_rate": 3.072175422374867e-05, + "loss": 0.3972, + "step": 1910 + }, + { + "epoch": 1.29, + "learning_rate": 3.055080340813623e-05, + "loss": 0.3963, + "step": 1920 + }, + { + "epoch": 1.29, + "learning_rate": 3.0379579145215287e-05, + "loss": 0.3941, + "step": 1930 + }, + { + "epoch": 1.3, + "learning_rate": 3.0208089869947475e-05, + "loss": 0.3887, + "step": 1940 + }, + { + "epoch": 1.31, + "learning_rate": 3.0036344030349644e-05, + "loss": 0.3879, + "step": 1950 + }, + { + "epoch": 1.31, + "learning_rate": 2.9864350087077702e-05, + "loss": 0.3945, + "step": 1960 + }, + { + "epoch": 1.32, + "learning_rate": 2.969211651300978e-05, + "loss": 0.3909, + "step": 1970 + }, + { + "epoch": 1.33, + "learning_rate": 2.9519651792828877e-05, + "loss": 0.3871, + "step": 1980 + }, + { + "epoch": 1.33, + "learning_rate": 2.9346964422604846e-05, + "loss": 0.3803, + "step": 1990 + }, + { + "epoch": 1.34, + "learning_rate": 2.9174062909375892e-05, + "loss": 0.3868, + "step": 2000 + }, + { + "epoch": 1.35, + "learning_rate": 2.9000955770729464e-05, + "loss": 0.385, + "step": 2010 + }, + { + "epoch": 1.35, + "learning_rate": 2.8827651534382655e-05, + "loss": 0.3871, + "step": 2020 + }, + { + "epoch": 1.36, + "learning_rate": 2.8654158737762122e-05, + "loss": 0.3956, + "step": 2030 + }, + { + "epoch": 1.37, + "learning_rate": 2.8480485927583506e-05, + "loss": 0.3884, + "step": 2040 + }, + { + "epoch": 1.37, + "learning_rate": 2.8306641659430382e-05, + "loss": 0.3829, + "step": 2050 + }, + { + "epoch": 1.38, + "learning_rate": 
2.8132634497332815e-05, + "loss": 0.3916, + "step": 2060 + }, + { + "epoch": 1.39, + "learning_rate": 2.7958473013345447e-05, + "loss": 0.3924, + "step": 2070 + }, + { + "epoch": 1.39, + "learning_rate": 2.7784165787125226e-05, + "loss": 0.3906, + "step": 2080 + }, + { + "epoch": 1.4, + "learning_rate": 2.7609721405508758e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 1.41, + "learning_rate": 2.7435148462089282e-05, + "loss": 0.3892, + "step": 2100 + }, + { + "epoch": 1.41, + "learning_rate": 2.7260455556793325e-05, + "loss": 0.3866, + "step": 2110 + }, + { + "epoch": 1.42, + "learning_rate": 2.708565129545706e-05, + "loss": 0.382, + "step": 2120 + }, + { + "epoch": 1.43, + "learning_rate": 2.691074428940237e-05, + "loss": 0.3825, + "step": 2130 + }, + { + "epoch": 1.43, + "learning_rate": 2.673574315501259e-05, + "loss": 0.3828, + "step": 2140 + }, + { + "epoch": 1.44, + "learning_rate": 2.656065651330808e-05, + "loss": 0.3845, + "step": 2150 + }, + { + "epoch": 1.45, + "learning_rate": 2.6385492989521522e-05, + "loss": 0.3904, + "step": 2160 + }, + { + "epoch": 1.45, + "learning_rate": 2.6210261212673004e-05, + "loss": 0.3934, + "step": 2170 + }, + { + "epoch": 1.46, + "learning_rate": 2.6034969815144938e-05, + "loss": 0.3893, + "step": 2180 + }, + { + "epoch": 1.47, + "learning_rate": 2.5859627432256816e-05, + "loss": 0.3965, + "step": 2190 + }, + { + "epoch": 1.47, + "learning_rate": 2.568424270183981e-05, + "loss": 0.3833, + "step": 2200 + }, + { + "epoch": 1.48, + "learning_rate": 2.550882426381123e-05, + "loss": 0.3822, + "step": 2210 + }, + { + "epoch": 1.49, + "learning_rate": 2.5333380759748925e-05, + "loss": 0.3823, + "step": 2220 + }, + { + "epoch": 1.49, + "learning_rate": 2.515792083246556e-05, + "loss": 0.3861, + "step": 2230 + }, + { + "epoch": 1.5, + "learning_rate": 2.4982453125582834e-05, + "loss": 0.3861, + "step": 2240 + }, + { + "epoch": 1.51, + "learning_rate": 2.4806986283105712e-05, + "loss": 0.3909, + "step": 2250 + }, + { + "epoch": 1.51, + "learning_rate": 2.463152894899658e-05, + "loss": 0.3854, + "step": 2260 + }, + { + "epoch": 1.52, + "learning_rate": 2.445608976674939e-05, + "loss": 0.394, + "step": 2270 + }, + { + "epoch": 1.53, + "learning_rate": 2.4280677378963906e-05, + "loss": 0.3866, + "step": 2280 + }, + { + "epoch": 1.53, + "learning_rate": 2.410530042691992e-05, + "loss": 0.3879, + "step": 2290 + }, + { + "epoch": 1.54, + "learning_rate": 2.3929967550151568e-05, + "loss": 0.388, + "step": 2300 + }, + { + "epoch": 1.55, + "learning_rate": 2.375468738602171e-05, + "loss": 0.3868, + "step": 2310 + }, + { + "epoch": 1.55, + "learning_rate": 2.3579468569296464e-05, + "loss": 0.3769, + "step": 2320 + }, + { + "epoch": 1.56, + "learning_rate": 2.340431973171978e-05, + "loss": 0.3811, + "step": 2330 + }, + { + "epoch": 1.57, + "learning_rate": 2.3229249501588278e-05, + "loss": 0.3805, + "step": 2340 + }, + { + "epoch": 1.57, + "learning_rate": 2.3054266503326165e-05, + "loss": 0.3822, + "step": 2350 + }, + { + "epoch": 1.58, + "learning_rate": 2.2879379357060345e-05, + "loss": 0.3875, + "step": 2360 + }, + { + "epoch": 1.59, + "learning_rate": 2.2704596678195827e-05, + "loss": 0.3884, + "step": 2370 + }, + { + "epoch": 1.59, + "learning_rate": 2.2529927076991283e-05, + "loss": 0.381, + "step": 2380 + }, + { + "epoch": 1.6, + "learning_rate": 2.2355379158134843e-05, + "loss": 0.3933, + "step": 2390 + }, + { + "epoch": 1.61, + "learning_rate": 2.2180961520320278e-05, + "loss": 0.3853, + "step": 2400 + }, + { + "epoch": 1.62, + "learning_rate": 
2.2006682755823367e-05, + "loss": 0.3871, + "step": 2410 + }, + { + "epoch": 1.62, + "learning_rate": 2.1832551450078594e-05, + "loss": 0.3867, + "step": 2420 + }, + { + "epoch": 1.63, + "learning_rate": 2.165857618125625e-05, + "loss": 0.3878, + "step": 2430 + }, + { + "epoch": 1.64, + "learning_rate": 2.1484765519839843e-05, + "loss": 0.386, + "step": 2440 + }, + { + "epoch": 1.64, + "learning_rate": 2.1311128028203863e-05, + "loss": 0.3836, + "step": 2450 + }, + { + "epoch": 1.65, + "learning_rate": 2.1137672260192004e-05, + "loss": 0.3849, + "step": 2460 + }, + { + "epoch": 1.66, + "learning_rate": 2.09644067606958e-05, + "loss": 0.3841, + "step": 2470 + }, + { + "epoch": 1.66, + "learning_rate": 2.079134006523359e-05, + "loss": 0.3848, + "step": 2480 + }, + { + "epoch": 1.67, + "learning_rate": 2.061848069953017e-05, + "loss": 0.3817, + "step": 2490 + }, + { + "epoch": 1.68, + "learning_rate": 2.044583717909667e-05, + "loss": 0.3839, + "step": 2500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0273418008811125e-05, + "loss": 0.3749, + "step": 2510 + }, + { + "epoch": 1.69, + "learning_rate": 2.0101231682499506e-05, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 1.7, + "learning_rate": 1.9929286682517235e-05, + "loss": 0.3854, + "step": 2530 + }, + { + "epoch": 1.7, + "learning_rate": 1.9757591479331382e-05, + "loss": 0.3894, + "step": 2540 + }, + { + "epoch": 1.71, + "learning_rate": 1.9586154531103373e-05, + "loss": 0.3871, + "step": 2550 + }, + { + "epoch": 1.72, + "learning_rate": 1.9414984283272286e-05, + "loss": 0.382, + "step": 2560 + }, + { + "epoch": 1.72, + "learning_rate": 1.9244089168138836e-05, + "loss": 0.3858, + "step": 2570 + }, + { + "epoch": 1.73, + "learning_rate": 1.9073477604449985e-05, + "loss": 0.3819, + "step": 2580 + }, + { + "epoch": 1.74, + "learning_rate": 1.8903157996984174e-05, + "loss": 0.3795, + "step": 2590 + }, + { + "epoch": 1.74, + "learning_rate": 1.873313873613733e-05, + "loss": 0.3827, + "step": 2600 + }, + { + "epoch": 1.75, + "learning_rate": 1.8563428197509502e-05, + "loss": 0.385, + "step": 2610 + }, + { + "epoch": 1.76, + "learning_rate": 1.839403474149225e-05, + "loss": 0.3846, + "step": 2620 + }, + { + "epoch": 1.76, + "learning_rate": 1.8224966712856806e-05, + "loss": 0.3828, + "step": 2630 + }, + { + "epoch": 1.77, + "learning_rate": 1.8056232440343013e-05, + "loss": 0.3792, + "step": 2640 + }, + { + "epoch": 1.78, + "learning_rate": 1.788784023624896e-05, + "loss": 0.3814, + "step": 2650 + }, + { + "epoch": 1.78, + "learning_rate": 1.7719798396021558e-05, + "loss": 0.3819, + "step": 2660 + }, + { + "epoch": 1.79, + "learning_rate": 1.7552115197847884e-05, + "loss": 0.3798, + "step": 2670 + }, + { + "epoch": 1.8, + "learning_rate": 1.7384798902247316e-05, + "loss": 0.3772, + "step": 2680 + }, + { + "epoch": 1.8, + "learning_rate": 1.7217857751664663e-05, + "loss": 0.3758, + "step": 2690 + }, + { + "epoch": 1.81, + "learning_rate": 1.7051299970064098e-05, + "loss": 0.3706, + "step": 2700 + }, + { + "epoch": 1.82, + "learning_rate": 1.6885133762523985e-05, + "loss": 0.3805, + "step": 2710 + }, + { + "epoch": 1.82, + "learning_rate": 1.6719367314832756e-05, + "loss": 0.3892, + "step": 2720 + }, + { + "epoch": 1.83, + "learning_rate": 1.65540087930856e-05, + "loss": 0.387, + "step": 2730 + }, + { + "epoch": 1.84, + "learning_rate": 1.6389066343282168e-05, + "loss": 0.3773, + "step": 2740 + }, + { + "epoch": 1.84, + "learning_rate": 1.6224548090925323e-05, + "loss": 0.3829, + "step": 2750 + }, + { + "epoch": 1.85, + "learning_rate": 
1.6060462140620835e-05, + "loss": 0.3697, + "step": 2760 + }, + { + "epoch": 1.86, + "learning_rate": 1.589681657567811e-05, + "loss": 0.3817, + "step": 2770 + }, + { + "epoch": 1.86, + "learning_rate": 1.5733619457712037e-05, + "loss": 0.3819, + "step": 2780 + }, + { + "epoch": 1.87, + "learning_rate": 1.5570878826245773e-05, + "loss": 0.3756, + "step": 2790 + }, + { + "epoch": 1.88, + "learning_rate": 1.5408602698314777e-05, + "loss": 0.3791, + "step": 2800 + }, + { + "epoch": 1.88, + "learning_rate": 1.5246799068071818e-05, + "loss": 0.3765, + "step": 2810 + }, + { + "epoch": 1.89, + "learning_rate": 1.5085475906393153e-05, + "loss": 0.3834, + "step": 2820 + }, + { + "epoch": 1.9, + "learning_rate": 1.4924641160485923e-05, + "loss": 0.381, + "step": 2830 + }, + { + "epoch": 1.9, + "learning_rate": 1.4764302753496584e-05, + "loss": 0.3766, + "step": 2840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4604468584120607e-05, + "loss": 0.3815, + "step": 2850 + }, + { + "epoch": 1.92, + "learning_rate": 1.4445146526213415e-05, + "loss": 0.3774, + "step": 2860 + }, + { + "epoch": 1.92, + "learning_rate": 1.4286344428402454e-05, + "loss": 0.3879, + "step": 2870 + }, + { + "epoch": 1.93, + "learning_rate": 1.412807011370052e-05, + "loss": 0.3777, + "step": 2880 + }, + { + "epoch": 1.94, + "learning_rate": 1.3970331379120455e-05, + "loss": 0.3806, + "step": 2890 + }, + { + "epoch": 1.94, + "learning_rate": 1.3813135995290988e-05, + "loss": 0.3848, + "step": 2900 + }, + { + "epoch": 1.95, + "learning_rate": 1.3656491706073935e-05, + "loss": 0.3745, + "step": 2910 + }, + { + "epoch": 1.96, + "learning_rate": 1.350040622818275e-05, + "loss": 0.377, + "step": 2920 + }, + { + "epoch": 1.96, + "learning_rate": 1.3344887250802345e-05, + "loss": 0.3783, + "step": 2930 + }, + { + "epoch": 1.97, + "learning_rate": 1.3189942435210301e-05, + "loss": 0.3768, + "step": 2940 + }, + { + "epoch": 1.98, + "learning_rate": 1.303557941439949e-05, + "loss": 0.3744, + "step": 2950 + }, + { + "epoch": 1.98, + "learning_rate": 1.2881805792702031e-05, + "loss": 0.3788, + "step": 2960 + }, + { + "epoch": 1.99, + "learning_rate": 1.2728629145414645e-05, + "loss": 0.3735, + "step": 2970 + }, + { + "epoch": 2.0, + "learning_rate": 1.257605701842554e-05, + "loss": 0.3819, + "step": 2980 + }, + { + "epoch": 2.0, + "learning_rate": 1.242409692784265e-05, + "loss": 0.3812, + "step": 2990 + }, + { + "epoch": 2.01, + "learning_rate": 1.2272756359623342e-05, + "loss": 0.3769, + "step": 3000 + }, + { + "epoch": 2.02, + "learning_rate": 1.2122042769205702e-05, + "loss": 0.3779, + "step": 3010 + }, + { + "epoch": 2.02, + "learning_rate": 1.1971963581141196e-05, + "loss": 0.3817, + "step": 3020 + }, + { + "epoch": 2.03, + "learning_rate": 1.1822526188728966e-05, + "loss": 0.3788, + "step": 3030 + }, + { + "epoch": 2.04, + "learning_rate": 1.1673737953651601e-05, + "loss": 0.3776, + "step": 3040 + }, + { + "epoch": 2.04, + "learning_rate": 1.1525606205612447e-05, + "loss": 0.3785, + "step": 3050 + }, + { + "epoch": 2.05, + "learning_rate": 1.1378138241974595e-05, + "loss": 0.3858, + "step": 3060 + }, + { + "epoch": 2.06, + "learning_rate": 1.1231341327401323e-05, + "loss": 0.3766, + "step": 3070 + }, + { + "epoch": 2.06, + "learning_rate": 1.1085222693498256e-05, + "loss": 0.3766, + "step": 3080 + }, + { + "epoch": 2.07, + "learning_rate": 1.093978953845713e-05, + "loss": 0.3795, + "step": 3090 + }, + { + "epoch": 2.08, + "learning_rate": 1.079504902670117e-05, + "loss": 0.3837, + "step": 3100 + }, + { + "epoch": 2.08, + 
"learning_rate": 1.065100828853213e-05, + "loss": 0.377, + "step": 3110 + }, + { + "epoch": 2.09, + "learning_rate": 1.0507674419779085e-05, + "loss": 0.3759, + "step": 3120 + }, + { + "epoch": 2.1, + "learning_rate": 1.0365054481448849e-05, + "loss": 0.3704, + "step": 3130 + }, + { + "epoch": 2.1, + "learning_rate": 1.02231554993781e-05, + "loss": 0.3751, + "step": 3140 + }, + { + "epoch": 2.11, + "learning_rate": 1.0081984463887325e-05, + "loss": 0.396, + "step": 3150 + }, + { + "epoch": 2.12, + "learning_rate": 9.941548329436425e-06, + "loss": 0.3788, + "step": 3160 + }, + { + "epoch": 2.12, + "learning_rate": 9.801854014282108e-06, + "loss": 0.3767, + "step": 3170 + }, + { + "epoch": 2.13, + "learning_rate": 9.662908400137125e-06, + "loss": 0.3783, + "step": 3180 + }, + { + "epoch": 2.14, + "learning_rate": 9.524718331831186e-06, + "loss": 0.3775, + "step": 3190 + }, + { + "epoch": 2.14, + "learning_rate": 9.387290616973859e-06, + "loss": 0.3789, + "step": 3200 + }, + { + "epoch": 2.15, + "learning_rate": 9.250632025619104e-06, + "loss": 0.3776, + "step": 3210 + }, + { + "epoch": 2.16, + "learning_rate": 9.11474928993187e-06, + "loss": 0.368, + "step": 3220 + }, + { + "epoch": 2.16, + "learning_rate": 8.979649103856345e-06, + "loss": 0.378, + "step": 3230 + }, + { + "epoch": 2.17, + "learning_rate": 8.84533812278629e-06, + "loss": 0.3776, + "step": 3240 + }, + { + "epoch": 2.18, + "learning_rate": 8.711822963237093e-06, + "loss": 0.3731, + "step": 3250 + }, + { + "epoch": 2.18, + "learning_rate": 8.579110202519894e-06, + "loss": 0.3827, + "step": 3260 + }, + { + "epoch": 2.19, + "learning_rate": 8.447206378417533e-06, + "loss": 0.3725, + "step": 3270 + }, + { + "epoch": 2.2, + "learning_rate": 8.31611798886246e-06, + "loss": 0.372, + "step": 3280 + }, + { + "epoch": 2.2, + "learning_rate": 8.185851491616677e-06, + "loss": 0.3753, + "step": 3290 + }, + { + "epoch": 2.21, + "learning_rate": 8.0564133039536e-06, + "loss": 0.3732, + "step": 3300 + }, + { + "epoch": 2.22, + "learning_rate": 7.927809802341876e-06, + "loss": 0.37, + "step": 3310 + }, + { + "epoch": 2.22, + "learning_rate": 7.800047322131346e-06, + "loss": 0.372, + "step": 3320 + }, + { + "epoch": 2.23, + "learning_rate": 7.673132157240877e-06, + "loss": 0.3734, + "step": 3330 + }, + { + "epoch": 2.24, + "learning_rate": 7.5470705598483405e-06, + "loss": 0.3734, + "step": 3340 + }, + { + "epoch": 2.24, + "learning_rate": 7.4218687400826075e-06, + "loss": 0.3784, + "step": 3350 + }, + { + "epoch": 2.25, + "learning_rate": 7.297532865717638e-06, + "loss": 0.3715, + "step": 3360 + }, + { + "epoch": 2.26, + "learning_rate": 7.174069061868591e-06, + "loss": 0.3836, + "step": 3370 + }, + { + "epoch": 2.27, + "learning_rate": 7.05148341069014e-06, + "loss": 0.3784, + "step": 3380 + }, + { + "epoch": 2.27, + "learning_rate": 6.929781951076836e-06, + "loss": 0.3737, + "step": 3390 + }, + { + "epoch": 2.28, + "learning_rate": 6.80897067836557e-06, + "loss": 0.3827, + "step": 3400 + }, + { + "epoch": 2.29, + "learning_rate": 6.6890555440403015e-06, + "loss": 0.3808, + "step": 3410 + }, + { + "epoch": 2.29, + "learning_rate": 6.570042455438822e-06, + "loss": 0.3797, + "step": 3420 + }, + { + "epoch": 2.3, + "learning_rate": 6.451937275461736e-06, + "loss": 0.3739, + "step": 3430 + }, + { + "epoch": 2.31, + "learning_rate": 6.334745822283699e-06, + "loss": 0.3748, + "step": 3440 + }, + { + "epoch": 2.31, + "learning_rate": 6.2184738690667214e-06, + "loss": 0.3769, + "step": 3450 + }, + { + "epoch": 2.32, + "learning_rate": 
6.103127143675832e-06, + "loss": 0.3756, + "step": 3460 + }, + { + "epoch": 2.33, + "learning_rate": 5.988711328396859e-06, + "loss": 0.3738, + "step": 3470 + }, + { + "epoch": 2.33, + "learning_rate": 5.875232059656552e-06, + "loss": 0.3676, + "step": 3480 + }, + { + "epoch": 2.34, + "learning_rate": 5.762694927744866e-06, + "loss": 0.3737, + "step": 3490 + }, + { + "epoch": 2.35, + "learning_rate": 5.651105476539623e-06, + "loss": 0.369, + "step": 3500 + }, + { + "epoch": 2.35, + "learning_rate": 5.540469203233347e-06, + "loss": 0.3723, + "step": 3510 + }, + { + "epoch": 2.36, + "learning_rate": 5.430791558062518e-06, + "loss": 0.3791, + "step": 3520 + }, + { + "epoch": 2.37, + "learning_rate": 5.322077944039039e-06, + "loss": 0.3753, + "step": 3530 + }, + { + "epoch": 2.37, + "learning_rate": 5.21433371668407e-06, + "loss": 0.3703, + "step": 3540 + }, + { + "epoch": 2.38, + "learning_rate": 5.107564183764219e-06, + "loss": 0.3781, + "step": 3550 + }, + { + "epoch": 2.39, + "learning_rate": 5.001774605030074e-06, + "loss": 0.3766, + "step": 3560 + }, + { + "epoch": 2.39, + "learning_rate": 4.8969701919570454e-06, + "loss": 0.38, + "step": 3570 + }, + { + "epoch": 2.4, + "learning_rate": 4.7931561074887e-06, + "loss": 0.3681, + "step": 3580 + }, + { + "epoch": 2.41, + "learning_rate": 4.690337465782366e-06, + "loss": 0.3752, + "step": 3590 + }, + { + "epoch": 2.41, + "learning_rate": 4.588519331957241e-06, + "loss": 0.3775, + "step": 3600 + }, + { + "epoch": 2.42, + "learning_rate": 4.4877067218448285e-06, + "loss": 0.3677, + "step": 3610 + }, + { + "epoch": 2.43, + "learning_rate": 4.38790460174188e-06, + "loss": 0.3718, + "step": 3620 + }, + { + "epoch": 2.43, + "learning_rate": 4.289117888165708e-06, + "loss": 0.3671, + "step": 3630 + }, + { + "epoch": 2.44, + "learning_rate": 4.191351447612032e-06, + "loss": 0.3728, + "step": 3640 + }, + { + "epoch": 2.45, + "learning_rate": 4.094610096315199e-06, + "loss": 0.3769, + "step": 3650 + }, + { + "epoch": 2.45, + "learning_rate": 3.998898600010928e-06, + "loss": 0.3777, + "step": 3660 + }, + { + "epoch": 2.46, + "learning_rate": 3.904221673701566e-06, + "loss": 0.3817, + "step": 3670 + }, + { + "epoch": 2.47, + "learning_rate": 3.810583981423796e-06, + "loss": 0.383, + "step": 3680 + }, + { + "epoch": 2.47, + "learning_rate": 3.7179901360188533e-06, + "loss": 0.3719, + "step": 3690 + }, + { + "epoch": 2.48, + "learning_rate": 3.626444698905329e-06, + "loss": 0.3716, + "step": 3700 + }, + { + "epoch": 2.49, + "learning_rate": 3.5359521798544347e-06, + "loss": 0.3736, + "step": 3710 + }, + { + "epoch": 2.49, + "learning_rate": 3.4465170367678294e-06, + "loss": 0.3741, + "step": 3720 + }, + { + "epoch": 2.5, + "learning_rate": 3.3581436754580363e-06, + "loss": 0.3756, + "step": 3730 + }, + { + "epoch": 2.51, + "learning_rate": 3.270836449431397e-06, + "loss": 0.3777, + "step": 3740 + }, + { + "epoch": 2.51, + "learning_rate": 3.184599659673579e-06, + "loss": 0.3774, + "step": 3750 + }, + { + "epoch": 2.52, + "learning_rate": 3.0994375544377424e-06, + "loss": 0.3785, + "step": 3760 + }, + { + "epoch": 2.53, + "learning_rate": 3.0153543290352164e-06, + "loss": 0.3768, + "step": 3770 + }, + { + "epoch": 2.53, + "learning_rate": 2.932354125628853e-06, + "loss": 0.377, + "step": 3780 + }, + { + "epoch": 2.54, + "learning_rate": 2.8504410330289778e-06, + "loss": 0.3803, + "step": 3790 + }, + { + "epoch": 2.55, + "learning_rate": 2.769619086491923e-06, + "loss": 0.3706, + "step": 3800 + }, + { + "epoch": 2.55, + "learning_rate": 
2.6898922675213016e-06, + "loss": 0.3712, + "step": 3810 + }, + { + "epoch": 2.56, + "learning_rate": 2.611264503671823e-06, + "loss": 0.369, + "step": 3820 + }, + { + "epoch": 2.57, + "learning_rate": 2.533739668355814e-06, + "loss": 0.3726, + "step": 3830 + }, + { + "epoch": 2.57, + "learning_rate": 2.45732158065243e-06, + "loss": 0.3746, + "step": 3840 + }, + { + "epoch": 2.58, + "learning_rate": 2.382014005119501e-06, + "loss": 0.3701, + "step": 3850 + }, + { + "epoch": 2.59, + "learning_rate": 2.3078206516080695e-06, + "loss": 0.38, + "step": 3860 + }, + { + "epoch": 2.59, + "learning_rate": 2.2347451750796474e-06, + "loss": 0.3725, + "step": 3870 + }, + { + "epoch": 2.6, + "learning_rate": 2.1627911754261653e-06, + "loss": 0.3825, + "step": 3880 + }, + { + "epoch": 2.61, + "learning_rate": 2.0919621972926156e-06, + "loss": 0.3791, + "step": 3890 + }, + { + "epoch": 2.61, + "learning_rate": 2.022261729902458e-06, + "loss": 0.3778, + "step": 3900 + }, + { + "epoch": 2.62, + "learning_rate": 1.953693206885715e-06, + "loss": 0.3735, + "step": 3910 + }, + { + "epoch": 2.63, + "learning_rate": 1.8862600061098106e-06, + "loss": 0.3816, + "step": 3920 + }, + { + "epoch": 2.63, + "learning_rate": 1.8199654495131974e-06, + "loss": 0.3752, + "step": 3930 + }, + { + "epoch": 2.64, + "learning_rate": 1.754812802941691e-06, + "loss": 0.3739, + "step": 3940 + }, + { + "epoch": 2.65, + "learning_rate": 1.6908052759875836e-06, + "loss": 0.3745, + "step": 3950 + }, + { + "epoch": 2.65, + "learning_rate": 1.6279460218315361e-06, + "loss": 0.3753, + "step": 3960 + }, + { + "epoch": 2.66, + "learning_rate": 1.5662381370872532e-06, + "loss": 0.3736, + "step": 3970 + }, + { + "epoch": 2.67, + "learning_rate": 1.5056846616489124e-06, + "loss": 0.3755, + "step": 3980 + }, + { + "epoch": 2.67, + "learning_rate": 1.4462885785414327e-06, + "loss": 0.3741, + "step": 3990 + }, + { + "epoch": 2.68, + "learning_rate": 1.3880528137735132e-06, + "loss": 0.3708, + "step": 4000 + }, + { + "epoch": 2.69, + "learning_rate": 1.3309802361934936e-06, + "loss": 0.3703, + "step": 4010 + }, + { + "epoch": 2.69, + "learning_rate": 1.2750736573480248e-06, + "loss": 0.3784, + "step": 4020 + }, + { + "epoch": 2.7, + "learning_rate": 1.2203358313435609e-06, + "loss": 0.3785, + "step": 4030 + }, + { + "epoch": 2.71, + "learning_rate": 1.1667694547106978e-06, + "loss": 0.3832, + "step": 4040 + }, + { + "epoch": 2.71, + "learning_rate": 1.1143771662713214e-06, + "loss": 0.3708, + "step": 4050 + }, + { + "epoch": 2.72, + "learning_rate": 1.063161547008612e-06, + "loss": 0.3777, + "step": 4060 + }, + { + "epoch": 2.73, + "learning_rate": 1.0131251199399089e-06, + "loss": 0.375, + "step": 4070 + }, + { + "epoch": 2.73, + "learning_rate": 9.642703499924216e-07, + "loss": 0.3719, + "step": 4080 + }, + { + "epoch": 2.74, + "learning_rate": 9.16599643881777e-07, + "loss": 0.3776, + "step": 4090 + }, + { + "epoch": 2.75, + "learning_rate": 8.701153499934833e-07, + "loss": 0.377, + "step": 4100 + }, + { + "epoch": 2.75, + "learning_rate": 8.248197582672395e-07, + "loss": 0.3759, + "step": 4110 + }, + { + "epoch": 2.76, + "learning_rate": 7.807151000841118e-07, + "loss": 0.3727, + "step": 4120 + }, + { + "epoch": 2.77, + "learning_rate": 7.378035481566181e-07, + "loss": 0.374, + "step": 4130 + }, + { + "epoch": 2.77, + "learning_rate": 6.960872164217064e-07, + "loss": 0.3792, + "step": 4140 + }, + { + "epoch": 2.78, + "learning_rate": 6.555681599365926e-07, + "loss": 0.3692, + "step": 4150 + }, + { + "epoch": 2.79, + "learning_rate": 
6.16248374777545e-07, + "loss": 0.3736, + "step": 4160 + }, + { + "epoch": 2.79, + "learning_rate": 5.781297979415456e-07, + "loss": 0.3695, + "step": 4170 + }, + { + "epoch": 2.8, + "learning_rate": 5.412143072508563e-07, + "loss": 0.3716, + "step": 4180 + }, + { + "epoch": 2.81, + "learning_rate": 5.055037212605279e-07, + "loss": 0.364, + "step": 4190 + }, + { + "epoch": 2.81, + "learning_rate": 4.709997991688114e-07, + "loss": 0.3707, + "step": 4200 + }, + { + "epoch": 2.82, + "learning_rate": 4.377042407304827e-07, + "loss": 0.3833, + "step": 4210 + }, + { + "epoch": 2.83, + "learning_rate": 4.0561868617312316e-07, + "loss": 0.3791, + "step": 4220 + }, + { + "epoch": 2.83, + "learning_rate": 3.747447161163126e-07, + "loss": 0.3743, + "step": 4230 + }, + { + "epoch": 2.84, + "learning_rate": 3.4508385149375764e-07, + "loss": 0.3759, + "step": 4240 + }, + { + "epoch": 2.85, + "learning_rate": 3.166375534783717e-07, + "loss": 0.3667, + "step": 4250 + }, + { + "epoch": 2.85, + "learning_rate": 2.8940722341030126e-07, + "loss": 0.3725, + "step": 4260 + }, + { + "epoch": 2.86, + "learning_rate": 2.6339420272787074e-07, + "loss": 0.3733, + "step": 4270 + }, + { + "epoch": 2.87, + "learning_rate": 2.3859977290152935e-07, + "loss": 0.373, + "step": 4280 + }, + { + "epoch": 2.87, + "learning_rate": 2.1502515537069334e-07, + "loss": 0.3725, + "step": 4290 + }, + { + "epoch": 2.88, + "learning_rate": 1.926715114835914e-07, + "loss": 0.3729, + "step": 4300 + }, + { + "epoch": 2.89, + "learning_rate": 1.7153994244005766e-07, + "loss": 0.3742, + "step": 4310 + }, + { + "epoch": 2.89, + "learning_rate": 1.516314892372639e-07, + "loss": 0.3739, + "step": 4320 + }, + { + "epoch": 2.9, + "learning_rate": 1.3294713261845503e-07, + "loss": 0.3755, + "step": 4330 + }, + { + "epoch": 2.91, + "learning_rate": 1.1548779302463231e-07, + "loss": 0.3702, + "step": 4340 + }, + { + "epoch": 2.92, + "learning_rate": 9.92543305492033e-08, + "loss": 0.3761, + "step": 4350 + }, + { + "epoch": 2.92, + "learning_rate": 8.424754489561038e-08, + "loss": 0.38, + "step": 4360 + }, + { + "epoch": 2.93, + "learning_rate": 7.046817533795102e-08, + "loss": 0.3737, + "step": 4370 + }, + { + "epoch": 2.94, + "learning_rate": 5.7916900684540366e-08, + "loss": 0.3722, + "step": 4380 + }, + { + "epoch": 2.94, + "learning_rate": 4.6594339244479536e-08, + "loss": 0.3805, + "step": 4390 + }, + { + "epoch": 2.95, + "learning_rate": 3.650104879719951e-08, + "loss": 0.3736, + "step": 4400 + }, + { + "epoch": 2.96, + "learning_rate": 2.7637526564971982e-08, + "loss": 0.37, + "step": 4410 + }, + { + "epoch": 2.96, + "learning_rate": 2.0004209188428937e-08, + "loss": 0.374, + "step": 4420 + }, + { + "epoch": 2.97, + "learning_rate": 1.3601472705046525e-08, + "loss": 0.3713, + "step": 4430 + }, + { + "epoch": 2.98, + "learning_rate": 8.429632530618236e-09, + "loss": 0.3686, + "step": 4440 + }, + { + "epoch": 2.98, + "learning_rate": 4.488943443711757e-09, + "loss": 0.3721, + "step": 4450 + }, + { + "epoch": 2.99, + "learning_rate": 1.779599573137336e-09, + "loss": 0.3766, + "step": 4460 + }, + { + "epoch": 3.0, + "learning_rate": 3.017343883637835e-10, + "loss": 0.376, + "step": 4470 + }, + { + "epoch": 3.0, + "step": 4476, + "total_flos": 4.0721010483030655e+19, + "train_loss": 0.40742741023141216, + "train_runtime": 104521.3326, + "train_samples_per_second": 10.964, + "train_steps_per_second": 0.043 + } + ], + "logging_steps": 10, + "max_steps": 4476, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 4.0721010483030655e+19, + 
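The learning-rate column decays from ~4.94e-05 at step 300 to ~3e-10 at step 4470, i.e. a warmup-free cosine schedule over the run's 4476 steps. Below is a minimal sketch of that schedule and of the throughput arithmetic in the summary block; the base learning rate of 5e-5 is an extrapolation of the logged values back to step 0 (not a value stated in this excerpt), and the logged rates agree with the formula exactly at early steps and to within one scheduler step later in the run:

```python
import math

# Hedged assumptions: base LR of 5e-5 is extrapolated from the logged values;
# the decay horizon is taken to be max_steps=4476 with zero warmup steps.
BASE_LR = 5e-5
TOTAL_STEPS = 4476

def cosine_lr(step: int) -> float:
    """Plain cosine decay: 0.5 * base * (1 + cos(pi * step / total))."""
    return 0.5 * BASE_LR * (1.0 + math.cos(math.pi * step / TOTAL_STEPS))

# Spot-check against records logged above; exact at step 300, and within one
# scheduler step of the logged value later in the run.
for step, logged in [(300, 4.9448e-05), (2000, 2.9174e-05), (4470, 3.0173e-10)]:
    print(f"step {step:>4}: computed {cosine_lr(step):.4e}, logged {logged:.4e}")

# Throughput arithmetic from the final summary block:
runtime_s = 104521.3326
print(TOTAL_STEPS / runtime_s)             # 0.0428 -> logged as 0.043 steps/s
print(10.964 / (TOTAL_STEPS / runtime_s))  # ~256 samples per optimizer step
```

The last line implies an effective global batch of roughly 256 samples per optimizer step, consistent with the logged 10.964 samples/s at 0.043 steps/s.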
"trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bee1ee30d7d7cc334212c9eb4b2b33b8c7d60a8 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729bf37ae27da0051469b6c2d9a7528c72ecfe49e138964d0506deffbecbf5dd +size 4283 diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..c6fd2335ab064b04e2c49d8853484e3fb997d6b0 Binary files /dev/null and b/training_loss.png differ