|
01/04/2024 10:04:05 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. |
|
[INFO|training_args.py:1838] 2024-01-04 10:04:05,581 >> PyTorch: setting up devices |
|
/home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
|
warnings.warn( |
|
01/04/2024 10:04:05 - INFO - llmtuner.model.parser - Process rank: 0, device: cuda:0, n_gpu: 1 |
|
distributed training: True, compute dtype: None |
|
01/04/2024 10:04:05 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments( |
|
_n_gpu=1, |
|
adafactor=False, |
|
adam_beta1=0.9, |
|
adam_beta2=0.999, |
|
adam_epsilon=1e-08, |
|
auto_find_batch_size=False, |
|
bf16=False, |
|
bf16_full_eval=False, |
|
data_seed=None, |
|
dataloader_drop_last=False, |
|
dataloader_num_workers=0, |
|
dataloader_persistent_workers=False, |
|
dataloader_pin_memory=True, |
|
ddp_backend=None, |
|
ddp_broadcast_buffers=None, |
|
ddp_bucket_cap_mb=None, |
|
ddp_find_unused_parameters=False, |
|
ddp_timeout=1800, |
|
debug=[], |
|
deepspeed=None, |
|
disable_tqdm=False, |
|
dispatch_batches=None, |
|
do_eval=False, |
|
do_predict=True, |
|
do_train=False, |
|
eval_accumulation_steps=None, |
|
eval_delay=0, |
|
eval_steps=None, |
|
evaluation_strategy=IntervalStrategy.NO, |
|
fp16=False, |
|
fp16_backend=auto, |
|
fp16_full_eval=False, |
|
fp16_opt_level=O1, |
|
fsdp=[], |
|
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, |
|
fsdp_min_num_params=0, |
|
fsdp_transformer_layer_cls_to_wrap=None, |
|
full_determinism=False, |
|
generation_config=None, |
|
generation_max_length=None, |
|
generation_num_beams=None, |
|
gradient_accumulation_steps=1, |
|
gradient_checkpointing=False, |
|
gradient_checkpointing_kwargs=None, |
|
greater_is_better=None, |
|
group_by_length=False, |
|
half_precision_backend=auto, |
|
hub_always_push=False, |
|
hub_model_id=None, |
|
hub_private_repo=False, |
|
hub_strategy=HubStrategy.EVERY_SAVE, |
|
hub_token=<HUB_TOKEN>, |
|
ignore_data_skip=False, |
|
include_inputs_for_metrics=False, |
|
include_num_input_tokens_seen=False, |
|
include_tokens_per_second=False, |
|
jit_mode_eval=False, |
|
label_names=None, |
|
label_smoothing_factor=0.0, |
|
learning_rate=5e-05, |
|
length_column_name=length, |
|
load_best_model_at_end=False, |
|
local_rank=0, |
|
log_level=passive, |
|
log_level_replica=warning, |
|
log_on_each_node=True, |
|
logging_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20/runs/Jan04_10-04-05_yhyu13fuwuqi, |
|
logging_first_step=False, |
|
logging_nan_inf_filter=True, |
|
logging_steps=500, |
|
logging_strategy=IntervalStrategy.STEPS, |
|
lr_scheduler_kwargs={}, |
|
lr_scheduler_type=SchedulerType.LINEAR, |
|
max_grad_norm=1.0, |
|
max_steps=-1, |
|
metric_for_best_model=None, |
|
mp_parameters=, |
|
neftune_noise_alpha=None, |
|
no_cuda=False, |
|
num_train_epochs=3.0, |
|
optim=OptimizerNames.ADAMW_TORCH, |
|
optim_args=None, |
|
output_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20, |
|
overwrite_output_dir=False, |
|
past_index=-1, |
|
per_device_eval_batch_size=1, |
|
per_device_train_batch_size=8, |
|
predict_with_generate=True, |
|
prediction_loss_only=False, |
|
push_to_hub=False, |
|
push_to_hub_model_id=None, |
|
push_to_hub_organization=None, |
|
push_to_hub_token=<PUSH_TO_HUB_TOKEN>, |
|
ray_scope=last, |
|
remove_unused_columns=True, |
|
report_to=['tensorboard'], |
|
resume_from_checkpoint=None, |
|
run_name=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20, |
|
save_on_each_node=False, |
|
save_only_model=False, |
|
save_safetensors=True, |
|
save_steps=500, |
|
save_strategy=IntervalStrategy.STEPS, |
|
save_total_limit=None, |
|
seed=42, |
|
skip_memory_metrics=True, |
|
sortish_sampler=False, |
|
split_batches=False, |
|
tf32=None, |
|
torch_compile=False, |
|
torch_compile_backend=None, |
|
torch_compile_mode=None, |
|
torchdynamo=None, |
|
tpu_metrics_debug=False, |
|
tpu_num_cores=None, |
|
use_cpu=False, |
|
use_ipex=False, |
|
use_legacy_prediction_loop=False, |
|
use_mps_device=False, |
|
warmup_ratio=0.0, |
|
warmup_steps=0, |
|
weight_decay=0.0, |
|
) |
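
For reference, the settings that matter for this prediction run can be reproduced directly with transformers. This is a minimal sketch that only sets the non-default values visible in the dump above (everything else falls back to the library defaults):

```python
# Minimal sketch: the key prediction settings from the dump above, expressed
# with transformers' Seq2SeqTrainingArguments.
from transformers import Seq2SeqTrainingArguments

predict_args = Seq2SeqTrainingArguments(
    output_dir="./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20",
    do_train=False,
    do_predict=True,
    per_device_eval_batch_size=1,
    predict_with_generate=True,        # decode with generate() instead of teacher forcing
    ddp_find_unused_parameters=False,  # required for LoRA in DDP, per the warning above
    report_to=["tensorboard"],
    seed=42,
)
```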
|
01/04/2024 10:04:05 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2/simple-function-calling-v2_converted.json... |
|
01/04/2024 10:04:05 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json. |
|
Using custom data configuration default-b024aadef2a1493c |
|
Loading Dataset Infos from /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/datasets/packaged_modules/json |
|
Overwrite dataset info from restored data version if exists. |
|
Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96 |
|
Found cached dataset json (/home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96) |
|
Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96 |
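
The "Checksum failed" warning above is benign: the dataset entry simply has no SHA-1 recorded in dataset_info.json. A minimal sketch of computing one with hashlib so it can be added (the `file_sha1` key name and dataset key are assumptions about LLaMA-Factory's dataset_info.json schema):

```python
# Minimal sketch: compute the SHA-1 of the dataset file to silence the
# "Checksum failed: missing SHA-1 hash value" warning above.
# The "file_sha1" key and dataset name are assumptions, not taken from the log.
import hashlib
import json

path = "./glaive-function-calling-v2/simple-function-calling-v2_converted.json"
with open(path, "rb") as f:
    sha1 = hashlib.sha1(f.read()).hexdigest()

entry = {"simple-function-calling-v2": {"file_name": path, "file_sha1": sha1}}
print(json.dumps(entry, indent=2))  # merge this into dataset_info.json
```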
|
[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file vocab.json |
|
[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file merges.txt |
|
[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file added_tokens.json |
|
[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file special_tokens_map.json |
|
[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file tokenizer.json |
|
[WARNING|logging.py:314] 2024-01-04 10:04:06,448 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
|
[INFO|configuration_utils.py:737] 2024-01-04 10:04:06,448 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json |
|
[INFO|configuration_utils.py:737] 2024-01-04 10:04:06,449 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json |
|
[INFO|configuration_utils.py:802] 2024-01-04 10:04:06,450 >> Model config PhiConfig { |
|
"_name_or_path": "cognitivecomputations/dolphin-2_6-phi-2", |
|
"activation_function": "gelu_new", |
|
"architectures": [ |
|
"PhiForCausalLM" |
|
], |
|
"attn_pdrop": 0.0, |
|
"auto_map": { |
|
"AutoConfig": "configuration_phi.PhiConfig", |
|
"AutoModelForCausalLM": "modeling_phi.PhiForCausalLM" |
|
}, |
|
"embd_pdrop": 0.0, |
|
"flash_attn": false, |
|
"flash_rotary": false, |
|
"fused_dense": false, |
|
"img_processor": null, |
|
"initializer_range": 0.02, |
|
"layer_norm_epsilon": 1e-05, |
|
"model_type": "phi-msft", |
|
"n_embd": 2560, |
|
"n_head": 32, |
|
"n_head_kv": null, |
|
"n_inner": null, |
|
"n_layer": 32, |
|
"n_positions": 2048, |
|
"resid_pdrop": 0.1, |
|
"rotary_dim": 32, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "float16", |
|
"transformers_version": "4.36.2", |
|
"use_cache": false, |
|
"vocab_size": 51200 |
|
} |
|
|
|
[INFO|modeling_utils.py:3341] 2024-01-04 10:04:06,482 >> loading weights file cognitivecomputations/dolphin-2_6-phi-2/model.safetensors.index.json |
|
[INFO|configuration_utils.py:826] 2024-01-04 10:04:06,483 >> Generate config GenerationConfig { |
|
"use_cache": false |
|
} |
|
|
|
[INFO|configuration_utils.py:826] 2024-01-04 10:04:06,483 >> Generate config GenerationConfig { |
|
"use_cache": false |
|
} |
|
|
|
|
|
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00, 1.41it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 2.34it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 2.13it/s]
|
[WARNING|modeling_utils.py:4175] 2024-01-04 10:04:07,704 >> Some weights of the model checkpoint at ./models/dolphin-2_6-phi-2 were not used when initializing PhiForCausalLM: ['lm_head.linear.lora_B.default.weight', 'lm_head.linear.lora_A.default.weight'] |
|
- This IS expected if you are initializing PhiForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
|
- This IS NOT expected if you are initializing PhiForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). |
|
[INFO|modeling_utils.py:4193] 2024-01-04 10:04:07,704 >> All the weights of PhiForCausalLM were initialized from the model checkpoint at ./models/dolphin-2_6-phi-2. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use PhiForCausalLM for predictions without further training. |
|
[INFO|configuration_utils.py:779] 2024-01-04 10:04:07,707 >> loading configuration file ./models/dolphin-2_6-phi-2/generation_config.json |
|
[INFO|configuration_utils.py:826] 2024-01-04 10:04:07,707 >> Generate config GenerationConfig {} |
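
Because the config's `auto_map` points at custom `configuration_phi`/`modeling_phi` code (model_type `phi-msft`), reproducing this load outside llmtuner needs `trust_remote_code=True`. A minimal sketch, assuming the local checkpoint path from the log:

```python
# Minimal sketch: load the tokenizer and base model the way the log does.
# trust_remote_code=True is required because the config uses custom Phi code.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

base_path = "./models/dolphin-2_6-phi-2"  # local copy of cognitivecomputations/dolphin-2_6-phi-2

tokenizer = AutoTokenizer.from_pretrained(base_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    base_path,
    torch_dtype=torch.float16,  # matches "torch_dtype": "float16" in the config above
    trust_remote_code=True,
)
```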
|
|
|
01/04/2024 10:04:08 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA |
|
01/04/2024 10:04:09 - INFO - llmtuner.model.adapter - Merged 1 adapter(s). |
|
01/04/2024 10:04:09 - INFO - llmtuner.model.adapter - Loaded adapter(s): ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora |
|
01/04/2024 10:04:09 - INFO - llmtuner.model.loader - trainable params: 0 || all params: 2779683840 || trainable%: 0.0000 |
|
01/04/2024 10:04:09 - INFO - llmtuner.model.loader - This IS expected that the trainable params is 0 if you are using model for inference only. |
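
The adapter lines above show that llmtuner merges the LoRA weights into the base model before inference, which is why the trainable parameter count is 0. A minimal sketch of the same idea with plain PEFT (not llmtuner's exact code path):

```python
# Minimal sketch: attach the LoRA adapter and merge it for inference only.
# This mirrors the "Merged 1 adapter(s)" log line, not llmtuner's exact code.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_path = "./models/dolphin-2_6-phi-2"
adapter_path = "./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora"

model = AutoModelForCausalLM.from_pretrained(base_path, trust_remote_code=True)
model = PeftModel.from_pretrained(model, adapter_path)
model = model.merge_and_unload()  # fold the LoRA deltas into the base weights
model.eval()                      # inference only, so no trainable params remain

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable params: {trainable} || all params: {total}")
```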
|
|
|
Running tokenizer on dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Caching processed dataset at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-c7eb5697298b6539.arrow

Running tokenizer on dataset: 100%|██████████| 20/20 [00:00<00:00, 360.26 examples/s]
|
[INFO|training_args.py:1838] 2024-01-04 10:04:09,995 >> PyTorch: setting up devices |
|
[INFO|trainer.py:3166] 2024-01-04 10:04:10,639 >> ***** Running Prediction ***** |
|
[INFO|trainer.py:3168] 2024-01-04 10:04:10,639 >> Num examples = 20 |
|
[INFO|trainer.py:3171] 2024-01-04 10:04:10,639 >> Batch size = 1 |
|
[INFO|configuration_utils.py:826] 2024-01-04 10:04:10,651 >> Generate config GenerationConfig { |
|
"use_cache": false |
|
} |
|
|
|
/home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/generation/utils.py:1518: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration ) |
|
warnings.warn( |
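
This UserWarning appears because a generation setting (here `use_cache=False`) lives on the model config rather than on a GenerationConfig. A minimal sketch of the recommended pattern, reusing `model` and `tokenizer` from the sketches above (the generation values are illustrative, not taken from the log):

```python
# Minimal sketch: control generation through GenerationConfig, which is what
# the deprecation warning above recommends.
from transformers import GenerationConfig

gen_config = GenerationConfig(
    max_new_tokens=256,  # illustrative value, not taken from the log
    do_sample=False,
)

# `model` and `tokenizer` are the merged model and tokenizer from the sketches above.
inputs = tokenizer(
    "Can you book a flight for me from New York to London?",
    return_tensors="pt",
).to(model.device)

output_ids = model.generate(**inputs, generation_config=gen_config)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```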
|
input_ids: |
|
[32, 8537, 1022, 257, 11040, 2836, 290, 281, 11666, 4430, 8796, 13, 383, 8796, 3607, 7613, 11, 6496, 11, 290, 23507, 7429, 284, 262, 2836, 338, 2683, 13, 198, 20490, 25, 36230, 25, 921, 389, 257, 7613, 8796, 351, 1895, 284, 262, 1708, 5499, 13, 5765, 606, 611, 2672, 532, 198, 90, 198, 50284, 1, 3672, 1298, 366, 1136, 62, 1069, 3803, 62, 4873, 1600, 198, 50284, 1, 11213, 1298, 366, 3855, 262, 5163, 2494, 1022, 734, 19247, 1600, 198, 50284, 1, 17143, 7307, 1298, 1391, 198, 50280, 1, 4906, 1298, 366, 15252, 1600, 198, 50280, 1, 48310, 1298, 1391, 198, 50276, 1, 8692, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 422, 1, 198, 50276, 5512, 198, 50276, 1, 16793, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 284, 1, 198, 50276, 92, 198, 50280, 5512, 198, 50280, 1, 35827, 1298, 685, 198, 50276, 1, 8692, 62, 34415, 1600, 198, 50276, 1, 16793, 62, 34415, 1, 198, 50280, 60, 198, 50284, 92, 198, 92, 198, 198, 6090, 345, 1492, 257, 5474, 329, 502, 422, 968, 1971, 284, 3576, 30, 198, 48902, 25] |
|
inputs: |
|
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. |
|
Human: SYSTEM: You are a helpful assistant with access to the following functions. Use them if required - |
|
{ |
|
"name": "get_exchange_rate", |
|
"description": "Get the exchange rate between two currencies", |
|
"parameters": { |
|
"type": "object", |
|
"properties": { |
|
"base_currency": { |
|
"type": "string", |
|
"description": "The currency to convert from" |
|
}, |
|
"target_currency": { |
|
"type": "string", |
|
"description": "The currency to convert to" |
|
} |
|
}, |
|
"required": [ |
|
"base_currency", |
|
"target_currency" |
|
] |
|
} |
|
} |
|
|
|
Can you book a flight for me from New York to London? |
|
Assistant: |
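
The `inputs` block is simply the `input_ids` sequence decoded by the tokenizer. A quick sanity check, reusing the tokenizer from the earlier sketch and only the first few ids printed above:

```python
# Minimal sketch: the "inputs" text above is just the decoded "input_ids".
# Decoding the first handful of ids should reproduce the start of the prompt
# ("A chat between a curious user and an artificial intelligence assistant.").
first_ids = [32, 8537, 1022, 257, 11040, 2836, 290, 281, 11666, 4430, 8796, 13]
print(tokenizer.decode(first_ids, skip_special_tokens=False))
```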
|
|
|
0%| | 0/20 [00:00<?, ?it/s] |
|
 10%|█         | 2/20 [00:01<00:11, 1.56it/s]

 15%|█▌        | 3/20 [00:03<00:19, 1.15s/it]

 20%|██        | 4/20 [00:04<00:20, 1.26s/it]

 25%|██▌       | 5/20 [00:07<00:25, 1.67s/it]

 30%|███       | 6/20 [00:07<00:19, 1.38s/it]

 35%|███▌      | 7/20 [00:09<00:18, 1.45s/it]

 40%|████      | 8/20 [00:10<00:17, 1.47s/it]

 45%|████▌     | 9/20 [00:12<00:15, 1.39s/it]

 50%|█████     | 10/20 [00:13<00:15, 1.51s/it]

 55%|█████▌    | 11/20 [00:14<00:11, 1.29s/it]

 60%|██████    | 12/20 [00:15<00:09, 1.13s/it]

 65%|██████▌   | 13/20 [00:17<00:09, 1.35s/it]

 70%|███████   | 14/20 [00:18<00:07, 1.29s/it]

 75%|███████▌  | 15/20 [00:20<00:07, 1.49s/it]

 80%|████████  | 16/20 [00:21<00:04, 1.25s/it]

 85%|████████▌ | 17/20 [00:22<00:03, 1.17s/it]

 90%|█████████ | 18/20 [00:24<00:02, 1.40s/it]

 95%|█████████▌| 19/20 [00:24<00:01, 1.14s/it]

100%|██████████| 20/20 [00:26<00:00, 1.34s/it]

Building prefix dict from the default dictionary ...
|
Loading model from cache /tmp/jieba.cache |
|
Loading model cost 0.697 seconds. |
|
Prefix dict has been built successfully. |
|
|
|
100%|██████████| 20/20 [00:27<00:00, 1.36s/it]
|
***** predict metrics ***** |
|
predict_bleu-4 = 74.1969 |
|
predict_rouge-1 = 80.3725 |
|
predict_rouge-2 = 70.3403 |
|
predict_rouge-l = 77.7235 |
|
predict_runtime = 0:00:29.41 |
|
predict_samples_per_second = 0.68 |
|
predict_steps_per_second = 0.68 |
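
The jieba messages above come from metric computation: predictions and references are word-segmented with jieba, then scored with rouge-chinese and NLTK's sentence-level BLEU. A minimal sketch of that scheme (it mirrors the idea behind `predict_bleu-4` / `predict_rouge-*`, not llmtuner's exact ComputeMetrics code):

```python
# Minimal sketch: jieba segmentation + rouge-chinese + NLTK sentence BLEU,
# the combination that produces scores like the ones reported above.
import jieba
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge_chinese import Rouge

def score_pair(prediction: str, label: str) -> dict:
    hyp = list(jieba.cut(prediction))
    ref = list(jieba.cut(label))

    rouge_scores = Rouge().get_scores(" ".join(hyp), " ".join(ref))[0]
    bleu = sentence_bleu([ref], hyp, smoothing_function=SmoothingFunction().method3)

    return {
        "bleu-4": round(bleu * 100, 4),
        "rouge-1": round(rouge_scores["rouge-1"]["f"] * 100, 4),
        "rouge-2": round(rouge_scores["rouge-2"]["f"] * 100, 4),
        "rouge-l": round(rouge_scores["rouge-l"]["f"] * 100, 4),
    }

print(score_pair("I cannot book flights.", "I'm sorry, but I cannot book flights."))
```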
|
01/04/2024 10:04:40 - INFO - llmtuner.train.sft.trainer - Saving prediction results to ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20/generated_predictions.jsonl |
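
Each line of the saved JSONL pairs a reference answer with a generated one. A minimal sketch for inspecting it (the `label`/`predict` field names are an assumption about llmtuner's output format):

```python
# Minimal sketch: inspect the saved predictions. The "label" / "predict"
# field names are an assumption about llmtuner's JSONL output, not confirmed by the log.
import json

path = ("./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora"
        "/Predict_20/generated_predictions.jsonl")

with open(path, encoding="utf-8") as f:
    for i, line in enumerate(f):
        record = json.loads(line)
        print(f"--- example {i} ---")
        print("label  :", record.get("label", "")[:120])
        print("predict:", record.get("predict", "")[:120])
```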
|
|