diff --git "a/train_eval_log.txt" "b/train_eval_log.txt" new file mode 100644--- /dev/null +++ "b/train_eval_log.txt" @@ -0,0 +1,2534 @@ +The following values were not passed to `accelerate launch` and had defaults used instead: + `--num_processes` was set to a value of `2` + More than one GPU was found, enabling multi-GPU training. + If this was unintended please pass in `--num_processes=1`. + `--num_machines` was set to a value of `1` + `--mixed_precision` was set to a value of `'no'` + `--dynamo_backend` was set to a value of `'no'` +To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`. +Using RTX 3090 or 4000 series which doesn't support faster communication speedups. Ensuring P2P and IB communications are disabled. +01/18/2024 18:29:34 - WARNING - llmtuner.model.parser - We recommend enable `upcast_layernorm` in quantized training. +01/18/2024 18:29:34 - WARNING - llmtuner.model.parser - We recommend enable mixed precision training. +01/18/2024 18:29:34 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +[INFO|training_args.py:1838] 2024-01-18 18:29:34,925 >> PyTorch: setting up devices +/home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead. + warnings.warn( +01/18/2024 18:29:34 - INFO - llmtuner.model.parser - Process rank: 0, device: cuda:0, n_gpu: 1 + distributed training: True, compute dtype: None +01/18/2024 18:29:34 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=False, +ddp_timeout=1800, +debug=[], +deepspeed=None, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=IntervalStrategy.EPOCH, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +generation_config=None, +generation_max_length=None, +generation_num_beams=None, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=./models/sft/LMCocktail-10.7B-v1-sft-glaive-function-calling-v2-ep1-lora/runs/Jan18_18-29-34_yhyu13fuwuqi, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=10, +logging_strategy=IntervalStrategy.STEPS, +lr_scheduler_kwargs={}, +lr_scheduler_type=SchedulerType.COSINE, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +output_dir=./models/sft/LMCocktail-10.7B-v1-sft-glaive-function-calling-v2-ep1-lora, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=1, +per_device_train_batch_size=1, +predict_with_generate=False, +prediction_loss_only=True, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=./models/sft/LMCocktail-10.7B-v1-sft-glaive-function-calling-v2-ep1-lora, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=1000, +save_strategy=IntervalStrategy.STEPS, +save_total_limit=None, +seed=42, +skip_memory_metrics=True, +sortish_sampler=False, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=0, +weight_decay=0.0, +) +01/18/2024 18:29:34 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2-llama-factory-convert/simple-function-calling-v2_converted_2000.json... +01/18/2024 18:29:34 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json. +01/18/2024 18:29:35 - WARNING - llmtuner.model.parser - We recommend enable `upcast_layernorm` in quantized training. +01/18/2024 18:29:35 - WARNING - llmtuner.model.parser - We recommend enable mixed precision training. +01/18/2024 18:29:35 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +/home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead. + warnings.warn( +01/18/2024 18:29:35 - INFO - llmtuner.model.parser - Process rank: 1, device: cuda:1, n_gpu: 1 + distributed training: True, compute dtype: None +01/18/2024 18:29:35 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=False, +ddp_timeout=1800, +debug=[], +deepspeed=None, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=IntervalStrategy.EPOCH, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +generation_config=None, +generation_max_length=None, +generation_num_beams=None, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=./models/sft/LMCocktail-10.7B-v1-sft-glaive-function-calling-v2-ep1-lora/runs/Jan18_18-29-34_yhyu13fuwuqi, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=10, +logging_strategy=IntervalStrategy.STEPS, +lr_scheduler_kwargs={}, +lr_scheduler_type=SchedulerType.COSINE, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +output_dir=./models/sft/LMCocktail-10.7B-v1-sft-glaive-function-calling-v2-ep1-lora, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=1, +per_device_train_batch_size=1, +predict_with_generate=False, +prediction_loss_only=True, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=./models/sft/LMCocktail-10.7B-v1-sft-glaive-function-calling-v2-ep1-lora, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=1000, +save_strategy=IntervalStrategy.STEPS, +save_total_limit=None, +seed=42, +skip_memory_metrics=True, +sortish_sampler=False, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=0, +weight_decay=0.0, +) +01/18/2024 18:29:35 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2-llama-factory-convert/simple-function-calling-v2_converted_2000.json... +01/18/2024 18:29:35 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json. +Using custom data configuration default-cb85ddec01d455d4 +Loading Dataset Infos from /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/datasets/packaged_modules/json +Generating dataset json (/home/hangyu5/.cache/huggingface/datasets/json/default-cb85ddec01d455d4/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96) +Downloading and preparing dataset json/default to /home/hangyu5/.cache/huggingface/datasets/json/default-cb85ddec01d455d4/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96... +Downloading took 0.0 min +Checksum Computation took 0.0 min +Generating train split + +Generating train split: 0 examples [00:00, ? examples/s] +Generating train split: 6640 examples [00:00, 69564.06 examples/s] +Unable to verify splits sizes. +Dataset json downloaded and prepared to /home/hangyu5/.cache/huggingface/datasets/json/default-cb85ddec01d455d4/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data. +[INFO|tokenization_utils_base.py:2024] 2024-01-18 18:29:36,121 >> loading file tokenizer.model +[INFO|tokenization_utils_base.py:2024] 2024-01-18 18:29:36,121 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2024] 2024-01-18 18:29:36,121 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2024] 2024-01-18 18:29:36,121 >> loading file tokenizerYhyu13/LMCocktail-10.7B-v1 +[INFO|tokenization_utils_base.py:2024] 2024-01-18 18:29:36,121 >> loading file tokenizer.json +[INFO|configuration_Yhyu13/LMCocktail-10.7B-v129:36,160 >> loading configuration file ./models/LMCocktail-10.7B-v1/config.json +[INFO|configuration_utils.py:802] 2024-01-18 18:29:36,161 >> Model config LlamaConfig { + "_name_or_path": "./models/LMCocktail-10.7B-v1", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "pad_token_id": 2, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.36.2", + "use_cache": true, + "vocab_size": 32000 +} +Yhyu13/LMCocktail-10.7B-v1 +01/18/2024 18:29:36 - INFO - llmtuner.model.patcher - Quantizing model to 4 bit. +[INFO|modeling_utils.py:3341] 2024-01-18 18:29:36,179 >> loading weights file ./models/LMCocktail-10.7B-v1/model.safetensors.index.json +[INFO|modeling_utils.py:1341] 2024-01-18 18:29:36,179 >> Instantiating LlamaForCausalLM model under default dtype torch.float16. +[INFO|configuration_utils.py:826] 2024-01-18 18:29:36,179 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +01/18/2024 18:29:36 - INFO - llmtuner.model.patcher - Quantizing model to 4 bit. +[INFO|modeling_utils.py:3483] 2024-01-18 18:29:37,052 >> Detected 4-bit loading: activating 4-bit loading for this model + +Loading checkpoint shards: 0%| | 0/5 [00:00> All model checkpoint weights were used when initializing LlamaForCausalLM. + +[INFO|modeling_utils.py:4193] 2024-01-18 18:29:41,340 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at ./models/LMCocktail-10.7B-v1. +If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. +[INFO|configuration_utils.py:779] 2024-01-18 18:29:41,344 >> loading configuration file ./models/LMCocktail-10.7B-v1/generation_config.json +[INFO|configuration_utils.py:826] 2024-01-18 18:29:41,344 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2, + "use_cache": false +} + +01/18/2024 18:29:41 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. +01/18/2024 18:29:41 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA +01/18/2024 18:29:41 - INFO - llmtuner.model.loader - trainable params: 5111808 || all params: 10736635904 || trainable%: 0.0476 +01/18/2024 18:29:41 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. +01/18/2024 18:29:41 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA +01/18/2024 18:29:41 - INFO - llmtuner.model.loader - trainable params: 5111808 || all params: 10736635904 || trainable%: 0.0476 + +Running tokenizer on dataset: 0%| | 0/6640 [00:00 ### User: +SYSTEM: You are a helpful assistant with access to the following functions. Use them if required - +{ + "name": "get_exchange_rate", + "description": "Get the exchange rate between two currencies", + "parameters": { + "type": "object", + "properties": { + "base_currency": { + "type": "string", + "description": "The currency to convert from" + }, + "target_currency": { + "type": "string", + "description": "The currency to convert to" + } + }, + "required": [ + "base_currency", + "target_currency" + ] + } +} + +Can you book a flight for me from New York to London? + +### Assistant: + I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask! +label_ids: +[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 315, 28742, 28719, 7371, 28725, 562, 315, 949, 28742, 28707, 506, 272, 21368, 298, 1820, 22447, 28723, 1984, 1868, 908, 5976, 528, 298, 625, 272, 8877, 4338, 1444, 989, 1191, 951, 20023, 28723, 1047, 368, 927, 1316, 395, 369, 28725, 1601, 1933, 298, 1460, 28808, 2] +labels: + I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask! +[INFO|training_args.py:1838] 2024-01-18 18:29:57,465 >> PyTorch: setting up devices + +Running tokenizer on dataset: 0%| | 0/6640 [00:00> ***** Running training ***** +[INFO|trainer.py:1707] 2024-01-18 18:30:12,809 >> Num examples = 5,975 +[INFO|trainer.py:1708] 2024-01-18 18:30:12,809 >> Num Epochs = 1 +[INFO|trainer.py:1709] 2024-01-18 18:30:12,809 >> Instantaneous batch size per device = 1 +[INFO|trainer.py:1712] 2024-01-18 18:30:12,809 >> Total train batch size (w. parallel, distributed & accumulation) = 8 +[INFO|trainer.py:1713] 2024-01-18 18:30:12,809 >> Gradient Accumulation steps = 4 +[INFO|trainer.py:1714] 2024-01-18 18:30:12,809 >> Total optimization steps = 747 +[INFO|trainer.py:1715] 2024-01-18 18:30:12,812 >> Number of trainable parameters = 5,111,808 +01/18/2024 18:30:14 - WARNING - llmtuner.extras.callbacks - Previous log file in this folder will be deleted. + + 0%| | 0/747 [00:00> ***** Running Evaluation ***** +[INFO|trainer.py:3168] 2024-01-18 19:16:11,445 >> Num examples = 664 +[INFO|trainer.py:3171] 2024-01-18 19:16:11,445 >> Batch size = 1 + + + 0%| | 0/332 [00:00> + +Training completed. Do not forget to share your model on huggingface.co/models =) + + + + +{'train_runtime': 2859.8545, 'train_samples_per_second': 2.089, 'train_steps_per_second': 0.261, 'train_loss': 0.3299662241814446, 'epoch': 1.0} + +100%|██████████| 747/747 [47:38<00:00, 3.69s/it] +100%|██████████| 747/747 [47:38<00:00, 3.83s/it] +[INFO|trainer.py:2889] 2024-01-18 19:17:52,669 >> Saving model checkpoint to ./models/sft/LMCocktail-10.7B-v1-sft-glaive-function-calling-v2-ep1-lora +[INFO|tokenization_utils_base.py:2432] 2024-01-18 19:17:52,742 >> tokenizer config file saved in ./models/sft/LMCocktail-10.7B-v1-sft-glaive-function-calling-v2-ep1-lora/tokenizer_config.json +[INFO|tokenization_utils_base.py:2441] 2024-01-18 19:17:52,742 >> Special tokens file saved in ./models/sft/LMCocktail-10.7B-v1-sft-glaive-function-calling-v2-ep1-lora/special_tokens_map.json +***** train metrics ***** + epoch = 1.0 + train_loss = 0.33 + train_runtime = 0:47:39.85 + train_samples_per_second = 2.089 + train_steps_per_second = 0.261 +Figure saved: ./models/sft/LMCocktail-10.7B-v1-sft-glaive-function-calling-v2-ep1-lora/training_loss.png +Figure saved: ./models/sft/LMCocktail-10.7B-v1-sft-glaive-function-calling-v2-ep1-lora/training_eval_loss.png +[INFO|trainer.py:3166] 2024-01-18 19:17:55,818 >> ***** Running Evaluation ***** +[INFO|trainer.py:3168] 2024-01-18 19:17:55,818 >> Num examples = 664 +[INFO|trainer.py:3171] 2024-01-18 19:17:55,818 >> Batch size = 1 + + 0%| | 0/332 [00:00> Dropping the following result as it does not have all the necessary fields: +{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}