yhyu13 committed
Commit: edc5215
Parent(s): 405aa43
Upload

Files changed:
- Predict_20/all_results.json +9 -0
- Predict_20/generated_predictions.jsonl +20 -0
- Predict_20/predict_log.txt +286 -0
- Predict_20/predict_results.json +9 -0
- adapter_config.json +25 -0
- adapter_model.safetensors +3 -0
- added_tokens.json +42 -0
- all_results.json +11 -0
- eval_results.json +7 -0
- merges.txt +0 -0
- special_tokens_map.json +30 -0
- tokenizer_config.json +345 -0
- train_eval_log.txt +1479 -0
- train_results.json +7 -0
- trainer_log.jsonl +40 -0
- trainer_state.json +260 -0
- training_args.bin +3 -0
- training_eval_loss.png +0 -0
- training_loss.png +0 -0
- vocab.json +0 -0
Predict_20/all_results.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "predict_bleu-4": 74.196885,
+    "predict_rouge-1": 80.372465,
+    "predict_rouge-2": 70.340295,
+    "predict_rouge-l": 77.72346,
+    "predict_runtime": 29.4113,
+    "predict_samples_per_second": 0.68,
+    "predict_steps_per_second": 0.68
+}
Predict_20/generated_predictions.jsonl
ADDED
@@ -0,0 +1,20 @@
+{"label": "I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!", "predict": "I'm sorry, but I'm unable to perform tasks outside of my defined functions. As of now, I can only help with tasks like getting the exchange rate between two currencies."}
+{"label": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"United States\"}'} </functioncall>", "predict": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"United States\"}'} </functioncall>"}
+{"label": "Here are the latest news headlines for the United States:\n1. Biden announces new vaccine mandates\n2. Hurricane Ida devastates Louisiana\n3. Apple unveils new iPhone\n4. NASA's Perseverance rover collects first Mars rock sample", "predict": "The latest news headlines for the United States are: Biden announces new vaccine mandates, Hurricane Ida devastates Louisiana, Apple unveils new iPhone, and NASA's Perseverance rover collects first Mars rock sample."}
+{"label": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"France\"}'} </functioncall>", "predict": "<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": '{\"country\": \"France\"}'} </functioncall>"}
+{"label": "Here are the latest news headlines for France:\n1. France recalls ambassadors to US and Australia\n2. French election: Macron's party braces for tough fight\n3. Louvre Museum to undergo major overhaul\n4. France to offer free birth control to all women under 25", "predict": "The latest news headlines for France are:\n1. France recalls ambassadors to US and Australia\n2. French election: Macron's party braces for tough fight\n3. Louvre Museum to undergo major overhaul\n4. France to offer free birth control to all women under 25"}
+{"label": "Of course. How long would you like your password to be? And would you like it to include symbols?", "predict": "Of course, I can help with that. How long would you like your password to be?"}
+{"label": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_symbols\": true}'} </functioncall>", "predict": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_symbols\": true}'} </functioncall>"}
+{"label": "Here is your new password: 4&7j#9@1Q6*. Please make sure to save it in a secure location.", "predict": "Here is your new password. It is 4&7j#9@1Q6* and is 12 characters long. It includes symbols as per your request."}
+{"label": "Of course! How long would you like your password to be? And do you want it to include numbers and special characters?", "predict": "Of course! I can help with that. How long would you like the password to be? And do you want it to include numbers or special characters?"}
+{"label": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_numbers\": true, \"include_special_characters\": true}'} </functioncall>", "predict": "<functioncall> {\"name\": \"generate_password\", \"arguments\": '{\"length\": 12, \"include_numbers\": true, \"include_special_characters\": true}'} </functioncall>"}
+{"label": "Here is your new password: 4#7gB6&9L1!0. Please make sure to save it in a secure place.", "predict": "Here is your new password: 4#7gB6&9L1!0"}
+{"label": "Of course, I can help you with that. Please provide me with the list of numbers.", "predict": "Of course, I can help you with that. Please provide me with the list of numbers."}
+{"label": "<functioncall> {\"name\": \"calculate_median\", \"arguments\": '{\"numbers\": [5, 2, 9, 1, 7, 4, 6, 3, 8]}'} </functioncall>", "predict": "<functioncall> {\"name\": \"calculate_median\", \"arguments\": '{\"numbers\": [5, 2, 9, 1, 7, 4, 6, 3, 8]}'} </functioncall>"}
+{"label": "The median of the provided list of numbers is 5.", "predict": "The median of the list of numbers [5, 2, 9, 1, 7, 4, 6, 3, 8] is 5."}
+{"label": "Of course, I can help you with that. Let me calculate it for you.\n<functioncall> {\"name\": \"calculate_loan_payment\", \"arguments\": '{\"principal\": 50000, \"interest_rate\": 5, \"loan_term\": 10}'} </functioncall>", "predict": "<functioncall> {\"name\": \"calculate_loan_payment\", \"arguments\": '{\"principal\": 50000, \"interest_rate\": 5, \"loan_term\": 10}'} </functioncall>"}
+{"label": "Based on the information provided, your monthly loan payment would be $530.33.", "predict": "According to the calculations, your monthly loan payment would be $530.33."}
+{"label": "I'm sorry, but as an AI, I don't have the capability to book flights or perform external tasks. My current function is to calculate loan payments. I recommend using a travel booking website or app for your flight booking needs.", "predict": "I'm sorry, but I'm unable to assist with booking flights. My current capabilities are limited to calculating loan payments."}
+{"label": "<functioncall> {\"name\": \"convert_temperature\", \"arguments\": '{\"temperature\": 30, \"from_unit\": \"Celsius\", \"to_unit\": \"Fahrenheit\"}'} </functioncall>", "predict": "<functioncall> {\"name\": \"convert_temperature\", \"arguments\": '{\"temperature\": 30, \"from_unit\": \"Celsius\", \"to_unit\": \"Fahrenheit\"}'} </functioncall>"}
+{"label": "The converted temperature from 30 degrees Celsius to Fahrenheit is 86 degrees Fahrenheit.", "predict": "The temperature 30 degrees Celsius is equivalent to 86 degrees Fahrenheit."}
+{"label": "I'm sorry, but I can't assist with that. My current capabilities allow me to provide information about movies based on their titles. I don't have the ability to place orders or perform external tasks.", "predict": "I'm sorry, but I'm unable to perform external tasks like ordering a pizza. My current capabilities are limited to providing information and answering questions. If you have any other questions or need assistance with something else, feel free to ask."}
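Note: each record above pairs the reference completion ("label") with the model output ("predict"). A minimal sketch (hypothetical helper, not part of this commit) for loading the file and counting how many function-call turns are reproduced verbatim:

```python
import json

def load_predictions(path: str) -> list[dict]:
    """Parse a generated_predictions.jsonl file into label/predict records."""
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f]

records = load_predictions("Predict_20/generated_predictions.jsonl")
# Turns whose reference is a structured function call
calls = [r for r in records if r["label"].startswith("<functioncall>")]
exact = sum(r["label"] == r["predict"] for r in calls)
print(f"{exact}/{len(calls)} function-call turns match the reference exactly")
```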
Predict_20/predict_log.txt
ADDED
@@ -0,0 +1,286 @@
+01/04/2024 10:04:05 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
+[INFO|training_args.py:1838] 2024-01-04 10:04:05,581 >> PyTorch: setting up devices
+/home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
+warnings.warn(
+01/04/2024 10:04:05 - INFO - llmtuner.model.parser - Process rank: 0, device: cuda:0, n_gpu: 1
+distributed training: True, compute dtype: None
+01/04/2024 10:04:05 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
+_n_gpu=1,
+adafactor=False,
+adam_beta1=0.9,
+adam_beta2=0.999,
+adam_epsilon=1e-08,
+auto_find_batch_size=False,
+bf16=False,
+bf16_full_eval=False,
+data_seed=None,
+dataloader_drop_last=False,
+dataloader_num_workers=0,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=False,
+ddp_timeout=1800,
+debug=[],
+deepspeed=None,
+disable_tqdm=False,
+dispatch_batches=None,
+do_eval=False,
+do_predict=True,
+do_train=False,
+eval_accumulation_steps=None,
+eval_delay=0,
+eval_steps=None,
+evaluation_strategy=IntervalStrategy.NO,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+fsdp=[],
+fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+generation_config=None,
+generation_max_length=None,
+generation_num_beams=None,
+gradient_accumulation_steps=1,
+gradient_checkpointing=False,
+gradient_checkpointing_kwargs=None,
+greater_is_better=None,
+group_by_length=False,
+half_precision_backend=auto,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=False,
+hub_strategy=HubStrategy.EVERY_SAVE,
+hub_token=<HUB_TOKEN>,
+ignore_data_skip=False,
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+learning_rate=5e-05,
+length_column_name=length,
+load_best_model_at_end=False,
+local_rank=0,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20/runs/Jan04_10-04-05_yhyu13fuwuqi,
+logging_first_step=False,
+logging_nan_inf_filter=True,
+logging_steps=500,
+logging_strategy=IntervalStrategy.STEPS,
+lr_scheduler_kwargs={},
+lr_scheduler_type=SchedulerType.LINEAR,
+max_grad_norm=1.0,
+max_steps=-1,
+metric_for_best_model=None,
+mp_parameters=,
+neftune_noise_alpha=None,
+no_cuda=False,
+num_train_epochs=3.0,
+optim=OptimizerNames.ADAMW_TORCH,
+optim_args=None,
+output_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20,
+overwrite_output_dir=False,
+past_index=-1,
+per_device_eval_batch_size=1,
+per_device_train_batch_size=8,
+predict_with_generate=True,
+prediction_loss_only=False,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ray_scope=last,
+remove_unused_columns=True,
+report_to=['tensorboard'],
+resume_from_checkpoint=None,
+run_name=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20,
+save_on_each_node=False,
+save_only_model=False,
+save_safetensors=True,
+save_steps=500,
+save_strategy=IntervalStrategy.STEPS,
+save_total_limit=None,
+seed=42,
+skip_memory_metrics=True,
+sortish_sampler=False,
+split_batches=False,
+tf32=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torchdynamo=None,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+use_cpu=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_mps_device=False,
+warmup_ratio=0.0,
+warmup_steps=0,
+weight_decay=0.0,
+)
+01/04/2024 10:04:05 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2/simple-function-calling-v2_converted.json...
+01/04/2024 10:04:05 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
+Using custom data configuration default-b024aadef2a1493c
+Loading Dataset Infos from /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/datasets/packaged_modules/json
+Overwrite dataset info from restored data version if exists.
+Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
+Found cached dataset json (/home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
+Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
+[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file vocab.json
+[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file merges.txt
+[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file added_tokens.json
+[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file special_tokens_map.json
+[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file tokenizer_config.json
+[INFO|tokenization_utils_base.py:2024] 2024-01-04 10:04:06,381 >> loading file tokenizer.json
+[WARNING|logging.py:314] 2024-01-04 10:04:06,448 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|configuration_utils.py:737] 2024-01-04 10:04:06,448 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
+[INFO|configuration_utils.py:737] 2024-01-04 10:04:06,449 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
+[INFO|configuration_utils.py:802] 2024-01-04 10:04:06,450 >> Model config PhiConfig {
+"_name_or_path": "cognitivecomputations/dolphin-2_6-phi-2",
+"activation_function": "gelu_new",
+"architectures": [
+"PhiForCausalLM"
+],
+"attn_pdrop": 0.0,
+"auto_map": {
+"AutoConfig": "configuration_phi.PhiConfig",
+"AutoModelForCausalLM": "modeling_phi.PhiForCausalLM"
+},
+"embd_pdrop": 0.0,
+"flash_attn": false,
+"flash_rotary": false,
+"fused_dense": false,
+"img_processor": null,
+"initializer_range": 0.02,
+"layer_norm_epsilon": 1e-05,
+"model_type": "phi-msft",
+"n_embd": 2560,
+"n_head": 32,
+"n_head_kv": null,
+"n_inner": null,
+"n_layer": 32,
+"n_positions": 2048,
+"resid_pdrop": 0.1,
+"rotary_dim": 32,
+"tie_word_embeddings": false,
+"torch_dtype": "float16",
+"transformers_version": "4.36.2",
+"use_cache": false,
+"vocab_size": 51200
+}
+
+[INFO|modeling_utils.py:3341] 2024-01-04 10:04:06,482 >> loading weights file cognitivecomputations/dolphin-2_6-phi-2/model.safetensors.index.json
+[INFO|configuration_utils.py:826] 2024-01-04 10:04:06,483 >> Generate config GenerationConfig {
+"use_cache": false
+}
+
+[INFO|configuration_utils.py:826] 2024-01-04 10:04:06,483 >> Generate config GenerationConfig {
+"use_cache": false
+}
+
+
+Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]cognitivecomputations/dolphin-2_6-phi-2
+Loading checkpoint shards: 50%|█████ | 1/2 [00:00<00:00, 1.41it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 2.34it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 2.13it/s]cognitivecomputations/dolphin-2_6-phi-2
+[WARNING|modeling_utils.py:4175] 2024-01-04 10:04:07,704 >> Some weights of the model checkpoint at ./models/dolphin-2_6-phi-2 were not used when initializing PhiForCausalLM: ['lm_head.linear.lora_B.default.weight', 'lm_head.linear.lora_A.default.weight']
+- This IS expected if you are initializing PhiForCausalLM from the checkpoint of a modelcognitivecomputations/dolphin-2_6-phi-2r with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing PhiForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+[INFO|modeling_utils.py:4193] 2024-01-04 10:04:07,704 >> All the weights of PhiForCausalLM were initialized from the model checkpoint at ./models/dolphin-2_6-phi-2.
+If your task is similar to the task the model of the checkpoint was trained on, you can already use PhiForCausalLM for predictions without further training.
+[INFO|configuration_utils.py:779] 2024-01-04 10:04:07,707 >> loading configuration file ./models/dolphin-2_6-phi-2/generation_config.json
+[INFO|configuration_utils.py:826] 2024-01-04 10:04:07,707 >> Generate config GenerationConfig {}
+
+01/04/2024 10:04:08 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
+01/04/2024 10:04:09 - INFO - llmtuner.model.adapter - Merged 1 adapter(s).
+01/04/2024 10:04:09 - INFO - llmtuner.model.adapter - Loaded adapter(s): ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora
+01/04/2024 10:04:09 - INFO - llmtuner.model.loader - trainable params: 0 || all params: 2779683840 || trainable%: 0.0000
+01/04/2024 10:04:09 - INFO - llmtuner.model.loader - This IS expected that the trainable params is 0 if you are using model for inference only.
+
+Running tokenizer on dataset: 0%| | 0/20 [00:00<?, ? examples/s]Caching processed dataset at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-c7eb5697298b6539.arrow
+
+Running tokenizer on dataset: 100%|██████████| 20/20 [00:00<00:00, 360.26 examples/s]
+[INFO|training_args.py:1838] 2024-01-04 10:04:09,995 >> PyTorch: setting up devices
+[INFO|trainer.py:3166] 2024-01-04 10:04:10,639 >> ***** Running Prediction *****
+[INFO|trainer.py:3168] 2024-01-04 10:04:10,639 >> Num examples = 20
+[INFO|trainer.py:3171] 2024-01-04 10:04:10,639 >> Batch size = 1
+[INFO|configuration_utils.py:826] 2024-01-04 10:04:10,651 >> Generate config GenerationConfig {
+"use_cache": false
+}
+
+/home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/generation/utils.py:1518: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )
+warnings.warn(
+input_ids:
+[32, 8537, 1022, 257, 11040, 2836, 290, 281, 11666, 4430, 8796, 13, 383, 8796, 3607, 7613, 11, 6496, 11, 290, 23507, 7429, 284, 262, 2836, 338, 2683, 13, 198, 20490, 25, 36230, 25, 921, 389, 257, 7613, 8796, 351, 1895, 284, 262, 1708, 5499, 13, 5765, 606, 611, 2672, 532, 198, 90, 198, 50284, 1, 3672, 1298, 366, 1136, 62, 1069, 3803, 62, 4873, 1600, 198, 50284, 1, 11213, 1298, 366, 3855, 262, 5163, 2494, 1022, 734, 19247, 1600, 198, 50284, 1, 17143, 7307, 1298, 1391, 198, 50280, 1, 4906, 1298, 366, 15252, 1600, 198, 50280, 1, 48310, 1298, 1391, 198, 50276, 1, 8692, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 422, 1, 198, 50276, 5512, 198, 50276, 1, 16793, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 284, 1, 198, 50276, 92, 198, 50280, 5512, 198, 50280, 1, 35827, 1298, 685, 198, 50276, 1, 8692, 62, 34415, 1600, 198, 50276, 1, 16793, 62, 34415, 1, 198, 50280, 60, 198, 50284, 92, 198, 92, 198, 198, 6090, 345, 1492, 257, 5474, 329, 502, 422, 968, 1971, 284, 3576, 30, 198, 48902, 25]
+inputs:
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+Human: SYSTEM: You are a helpful assistant with access to the following functions. Use them if required -
+{
+"name": "get_exchange_rate",
+"description": "Get the exchange rate between two currencies",
+"parameters": {
+"type": "object",
+"properties": {
+"base_currency": {
+"type": "string",
+"description": "The currency to convert from"
+},
+"target_currency": {
+"type": "string",
+"description": "The currency to convert to"
+}
+},
+"required": [
+"base_currency",
+"target_currency"
+]
+}
+}
+
+Can you book a flight for me from New York to London?
+Assistant:
+
+0%| | 0/20 [00:00<?, ?it/s]
+10%|█ | 2/20 [00:01<00:11, 1.56it/s]
+15%|█▌ | 3/20 [00:03<00:19, 1.15s/it]
+20%|██ | 4/20 [00:04<00:20, 1.26s/it]
+25%|██▌ | 5/20 [00:07<00:25, 1.67s/it]
+30%|███ | 6/20 [00:07<00:19, 1.38s/it]
+35%|███▌ | 7/20 [00:09<00:18, 1.45s/it]
+40%|████ | 8/20 [00:10<00:17, 1.47s/it]
+45%|████▌ | 9/20 [00:12<00:15, 1.39s/it]
+50%|█████ | 10/20 [00:13<00:15, 1.51s/it]
+55%|█████▌ | 11/20 [00:14<00:11, 1.29s/it]
+60%|██████ | 12/20 [00:15<00:09, 1.13s/it]
+65%|██████▌ | 13/20 [00:17<00:09, 1.35s/it]
+70%|███████ | 14/20 [00:18<00:07, 1.29s/it]
+75%|███████▌ | 15/20 [00:20<00:07, 1.49s/it]
+80%|████████ | 16/20 [00:21<00:04, 1.25s/it]
+85%|████████▌ | 17/20 [00:22<00:03, 1.17s/it]
+90%|█████████ | 18/20 [00:24<00:02, 1.40s/it]
+95%|█████████▌| 19/20 [00:24<00:01, 1.14s/it]
+100%|██████████| 20/20 [00:26<00:00, 1.34s/it]Building prefix dict from the default dictionary ...
+Loading model from cache /tmp/jieba.cache
+Loading model cost 0.697 seconds.
+Prefix dict has been built successfully.
+
+100%|██████████| 20/20 [00:27<00:00, 1.36s/it]
+***** predict metrics *****
+predict_bleu-4 = 74.1969
+predict_rouge-1 = 80.3725
+predict_rouge-2 = 70.3403
+predict_rouge-l = 77.7235
+predict_runtime = 0:00:29.41
+predict_samples_per_second = 0.68
+predict_steps_per_second = 0.68
+01/04/2024 10:04:40 - INFO - llmtuner.train.sft.trainer - Saving prediction results to ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/Predict_20/generated_predictions.jsonl
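Note: the "Building prefix dict" / jieba.cache lines in the log come from the metric step. A sketch of how LLaMA-Factory-style BLEU-4/ROUGE scores of this kind are typically computed (assumes the jieba, rouge-chinese, nltk and numpy packages; simplified, not the exact code used for this run):

```python
import json
import jieba  # word segmenter whose cache appears as /tmp/jieba.cache in the log
import numpy as np
from rouge_chinese import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

bleu4, rouge_l = [], []
with open("Predict_20/generated_predictions.jsonl", encoding="utf-8") as f:
    for rec in map(json.loads, f):
        hyp = list(jieba.cut(rec["predict"]))
        ref = list(jieba.cut(rec["label"]))
        # ROUGE over space-joined segmented tokens; take the F1 scores
        scores = Rouge().get_scores(" ".join(hyp), " ".join(ref))[0]
        rouge_l.append(scores["rouge-l"]["f"])
        # Smoothed sentence-level BLEU-4 over character lists
        bleu4.append(sentence_bleu([list(rec["label"])], list(rec["predict"]),
                                   smoothing_function=SmoothingFunction().method3))
print(f"predict_bleu-4 ~ {100 * np.mean(bleu4):.4f}")
print(f"predict_rouge-l ~ {100 * np.mean(rouge_l):.4f}")
```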
Predict_20/predict_results.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "predict_bleu-4": 74.196885,
+    "predict_rouge-1": 80.372465,
+    "predict_rouge-2": 70.340295,
+    "predict_rouge-l": 77.72346,
+    "predict_runtime": 29.4113,
+    "predict_samples_per_second": 0.68,
+    "predict_steps_per_second": 0.68
+}
adapter_config.json
ADDED
@@ -0,0 +1,25 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "cognitivecomputations/dolphin-2_6-phi-2",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "Wqkv"
+  ],
+  "task_type": "CAUSAL_LM"
+}
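Note: a minimal sketch (not part of the commit; the local adapter path is a placeholder) for applying this LoRA adapter with PEFT. trust_remote_code is needed because the base config maps the custom "phi-msft" model type via auto_map:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "cognitivecomputations/dolphin-2_6-phi-2",
    torch_dtype=torch.float16,
    trust_remote_code=True,  # custom PhiForCausalLM pulled in via auto_map
)
model = PeftModel.from_pretrained(base, "./dolphin-2_6-phi-2-sft-lora")  # placeholder path
model = model.merge_and_unload()  # fold the r=8 "Wqkv" LoRA deltas into the base weights
tokenizer = AutoTokenizer.from_pretrained("./dolphin-2_6-phi-2-sft-lora")  # placeholder path
```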
adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3068eae74f7a7a2dcd863a7d976b25c9a4a719bf7cef9e3ae42edd45c5c482fb
+size 10493896
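Note: this is a Git LFS pointer, not the tensor data itself; the oid is the SHA-256 of the real 10,493,896-byte file. A sketch for verifying a downloaded adapter file against the pointer:

```python
import hashlib

def lfs_oid(path: str) -> str:
    """SHA-256 of the file contents, as recorded in the LFS pointer's oid field."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest()

assert lfs_oid("adapter_model.safetensors") == (
    "3068eae74f7a7a2dcd863a7d976b25c9a4a719bf7cef9e3ae42edd45c5c482fb"
)
```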
added_tokens.json
ADDED
@@ -0,0 +1,42 @@
+{
+  "\t\t": 50294,
+  "\t\t\t": 50293,
+  "\t\t\t\t": 50292,
+  "\t\t\t\t\t": 50291,
+  "\t\t\t\t\t\t": 50290,
+  "\t\t\t\t\t\t\t": 50289,
+  "\t\t\t\t\t\t\t\t": 50288,
+  "\t\t\t\t\t\t\t\t\t": 50287,
+  " ": 50286,
+  " ": 50285,
+  " ": 50284,
+  " ": 50283,
+  " ": 50282,
+  " ": 50281,
+  " ": 50280,
+  " ": 50279,
+  " ": 50278,
+  " ": 50277,
+  " ": 50276,
+  " ": 50275,
+  " ": 50274,
+  " ": 50273,
+  " ": 50272,
+  " ": 50271,
+  " ": 50270,
+  " ": 50269,
+  " ": 50268,
+  " ": 50267,
+  " ": 50266,
+  " ": 50265,
+  " ": 50264,
+  " ": 50263,
+  " ": 50262,
+  " ": 50261,
+  " ": 50260,
+  " ": 50259,
+  " ": 50258,
+  " ": 50257,
+  "<|im_end|>": 50295,
+  "<|im_start|>": 50296
+}
all_results.json
ADDED
@@ -0,0 +1,11 @@
+{
+    "epoch": 1.0,
+    "eval_loss": 0.35242682695388794,
+    "eval_runtime": 28.2489,
+    "eval_samples_per_second": 11.859,
+    "eval_steps_per_second": 5.947,
+    "train_loss": 0.4441075046011742,
+    "train_runtime": 553.4721,
+    "train_samples_per_second": 5.44,
+    "train_steps_per_second": 0.679
+}
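Note: these throughput numbers are self-consistent: train_runtime × train_steps_per_second ≈ 553.47 × 0.679 ≈ 376 optimizer steps, and train_runtime × train_samples_per_second ≈ 3011 samples, i.e. about 8 samples per step, which matches per_device_train_batch_size=1 × gradient_accumulation_steps=4 × 2 GPUs (ranks 0 and 1) from the training log below.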
eval_results.json
ADDED
@@ -0,0 +1,7 @@
+{
+    "epoch": 1.0,
+    "eval_loss": 0.35242682695388794,
+    "eval_runtime": 28.2489,
+    "eval_samples_per_second": 11.859,
+    "eval_steps_per_second": 5.947
+}
merges.txt
ADDED
The diff for this file is too large to render.
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer_config.json
ADDED
@@ -0,0 +1,345 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50257": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50280": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50281": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50282": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50283": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50284": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50285": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50286": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50287": {
+      "content": "\t\t\t\t\t\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50288": {
+      "content": "\t\t\t\t\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50289": {
+      "content": "\t\t\t\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50290": {
+      "content": "\t\t\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50291": {
+      "content": "\t\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50292": {
+      "content": "\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50293": {
+      "content": "\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50294": {
+      "content": "\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50295": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50296": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 2048,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "CodeGenTokenizer",
+  "unk_token": "<|endoftext|>"
+}
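Note: the chat_template above is the ChatML format. A sketch (local path is a placeholder) of rendering a conversation with it via transformers' apply_chat_template:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./dolphin-2_6-phi-2-sft-lora")  # placeholder path
messages = [{"role": "user", "content": "What's the USD to EUR rate?"}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>user
# What's the USD to EUR rate?<|im_end|>
# <|im_start|>assistant
```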
train_eval_log.txt
ADDED
@@ -0,0 +1,1479 @@
1 |
+
Using RTX 3090 or 4000 series which doesn't support faster communication speedups. Ensuring P2P and IB communications are disabled.
|
2 |
+
01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable `upcast_layernorm` in quantized training.
|
3 |
+
01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable mixed precision training.
|
4 |
+
01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
|
5 |
+
[INFO|training_args.py:1838] 2024-01-04 09:53:50,866 >> PyTorch: setting up devices
|
6 |
+
/home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
|
7 |
+
warnings.warn(
|
8 |
+
01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Process rank: 0, device: cuda:0, n_gpu: 1
|
9 |
+
distributed training: True, compute dtype: None
|
10 |
+
01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
|
11 |
+
_n_gpu=1,
|
12 |
+
adafactor=False,
|
13 |
+
adam_beta1=0.9,
|
14 |
+
adam_beta2=0.999,
|
15 |
+
adam_epsilon=1e-08,
|
16 |
+
auto_find_batch_size=False,
|
17 |
+
bf16=False,
|
18 |
+
bf16_full_eval=False,
|
19 |
+
data_seed=None,
|
20 |
+
dataloader_drop_last=False,
|
21 |
+
dataloader_num_workers=0,
|
22 |
+
dataloader_persistent_workers=False,
|
23 |
+
dataloader_pin_memory=True,
|
24 |
+
ddp_backend=None,
|
25 |
+
ddp_broadcast_buffers=None,
|
26 |
+
ddp_bucket_cap_mb=None,
|
27 |
+
ddp_find_unused_parameters=False,
|
28 |
+
ddp_timeout=1800,
|
29 |
+
debug=[],
|
30 |
+
deepspeed=None,
|
31 |
+
disable_tqdm=False,
|
32 |
+
dispatch_batches=None,
|
33 |
+
do_eval=True,
|
34 |
+
do_predict=False,
|
35 |
+
do_train=True,
|
36 |
+
eval_accumulation_steps=None,
|
37 |
+
eval_delay=0,
|
38 |
+
eval_steps=None,
|
39 |
+
evaluation_strategy=IntervalStrategy.EPOCH,
|
40 |
+
fp16=False,
|
41 |
+
fp16_backend=auto,
|
42 |
+
fp16_full_eval=False,
|
43 |
+
fp16_opt_level=O1,
|
44 |
+
fsdp=[],
|
45 |
+
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
|
46 |
+
fsdp_min_num_params=0,
|
47 |
+
fsdp_transformer_layer_cls_to_wrap=None,
|
48 |
+
full_determinism=False,
|
49 |
+
generation_config=None,
|
50 |
+
generation_max_length=None,
|
51 |
+
generation_num_beams=None,
|
52 |
+
gradient_accumulation_steps=4,
|
53 |
+
gradient_checkpointing=False,
|
54 |
+
gradient_checkpointing_kwargs=None,
|
55 |
+
greater_is_better=None,
|
56 |
+
group_by_length=False,
|
57 |
+
half_precision_backend=auto,
|
58 |
+
hub_always_push=False,
|
59 |
+
hub_model_id=None,
|
60 |
+
hub_private_repo=False,
|
61 |
+
hub_strategy=HubStrategy.EVERY_SAVE,
|
62 |
+
hub_token=<HUB_TOKEN>,
|
63 |
+
ignore_data_skip=False,
|
64 |
+
include_inputs_for_metrics=False,
|
65 |
+
include_num_input_tokens_seen=False,
|
66 |
+
include_tokens_per_second=False,
|
67 |
+
jit_mode_eval=False,
|
68 |
+
label_names=None,
|
69 |
+
label_smoothing_factor=0.0,
|
70 |
+
learning_rate=5e-05,
|
71 |
+
length_column_name=length,
|
72 |
+
load_best_model_at_end=False,
|
73 |
+
local_rank=0,
|
74 |
+
log_level=passive,
|
75 |
+
log_level_replica=warning,
|
76 |
+
log_on_each_node=True,
|
77 |
+
logging_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/runs/Jan04_09-53-50_yhyu13fuwuqi,
|
78 |
+
logging_first_step=False,
|
79 |
+
logging_nan_inf_filter=True,
|
80 |
+
logging_steps=10,
|
81 |
+
logging_strategy=IntervalStrategy.STEPS,
|
82 |
+
lr_scheduler_kwargs={},
|
83 |
+
lr_scheduler_type=SchedulerType.COSINE,
|
84 |
+
max_grad_norm=1.0,
|
85 |
+
max_steps=-1,
|
86 |
+
metric_for_best_model=None,
|
87 |
+
mp_parameters=,
|
88 |
+
neftune_noise_alpha=None,
|
89 |
+
no_cuda=False,
|
90 |
+
num_train_epochs=1.0,
|
91 |
+
optim=OptimizerNames.ADAMW_TORCH,
|
92 |
+
optim_args=None,
|
93 |
+
output_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
|
94 |
+
overwrite_output_dir=True,
|
95 |
+
past_index=-1,
|
96 |
+
per_device_eval_batch_size=1,
|
97 |
+
per_device_train_batch_size=1,
|
98 |
+
predict_with_generate=False,
|
99 |
+
prediction_loss_only=True,
|
100 |
+
push_to_hub=False,
|
101 |
+
push_to_hub_model_id=None,
|
102 |
+
push_to_hub_organization=None,
|
103 |
+
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
|
104 |
+
ray_scope=last,
|
105 |
+
remove_unused_columns=True,
|
106 |
+
report_to=['tensorboard'],
|
107 |
+
resume_from_checkpoint=None,
|
108 |
+
run_name=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
|
109 |
+
save_on_each_node=False,
|
110 |
+
save_only_model=False,
|
111 |
+
save_safetensors=True,
|
112 |
+
save_steps=1000,
|
113 |
+
save_strategy=IntervalStrategy.STEPS,
|
114 |
+
save_total_limit=None,
|
115 |
+
seed=42,
|
116 |
+
skip_memory_metrics=True,
|
117 |
+
sortish_sampler=False,
|
118 |
+
split_batches=False,
|
119 |
+
tf32=None,
|
120 |
+
torch_compile=False,
|
121 |
+
torch_compile_backend=None,
|
122 |
+
torch_compile_mode=None,
|
123 |
+
torchdynamo=None,
|
124 |
+
tpu_metrics_debug=False,
|
125 |
+
tpu_num_cores=None,
|
126 |
+
use_cpu=False,
|
127 |
+
use_ipex=False,
|
128 |
+
use_legacy_prediction_loop=False,
|
129 |
+
use_mps_device=False,
|
130 |
+
warmup_ratio=0.0,
|
131 |
+
warmup_steps=0,
|
132 |
+
weight_decay=0.0,
|
133 |
+
)
|
134 |
+
01/04/2024 09:53:50 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2/simple-function-calling-v2_converted.json...
|
135 |
+
01/04/2024 09:53:50 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
|
136 |
+
01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable `upcast_layernorm` in quantized training.
|
137 |
+
01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - We recommend enable mixed precision training.
|
138 |
+
01/04/2024 09:53:50 - WARNING - llmtuner.model.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
|
139 |
+
/home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/transformers/training_args.py:1751: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
|
140 |
+
warnings.warn(
|
141 |
+
01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Process rank: 1, device: cuda:1, n_gpu: 1
|
142 |
+
distributed training: True, compute dtype: None
|
143 |
+
01/04/2024 09:53:50 - INFO - llmtuner.model.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
|
144 |
+
_n_gpu=1,
|
145 |
+
adafactor=False,
|
146 |
+
adam_beta1=0.9,
|
147 |
+
adam_beta2=0.999,
|
148 |
+
adam_epsilon=1e-08,
|
149 |
+
auto_find_batch_size=False,
|
150 |
+
bf16=False,
|
151 |
+
bf16_full_eval=False,
|
152 |
+
data_seed=None,
|
153 |
+
dataloader_drop_last=False,
|
154 |
+
dataloader_num_workers=0,
|
155 |
+
dataloader_persistent_workers=False,
|
156 |
+
dataloader_pin_memory=True,
|
157 |
+
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=False,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=IntervalStrategy.EPOCH,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
generation_config=None,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=1,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/runs/Jan04_09-53-50_yhyu13fuwuqi,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=10,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_kwargs={},
lr_scheduler_type=SchedulerType.COSINE,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=1.0,
optim=OptimizerNames.ADAMW_TORCH,
optim_args=None,
output_dir=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
overwrite_output_dir=True,
past_index=-1,
per_device_eval_batch_size=1,
per_device_train_batch_size=1,
predict_with_generate=False,
prediction_loss_only=True,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=['tensorboard'],
resume_from_checkpoint=None,
run_name=./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=1000,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=None,
seed=42,
skip_memory_metrics=True,
sortish_sampler=False,
split_batches=False,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
)
01/04/2024 09:53:50 - INFO - llmtuner.data.loader - Loading dataset ./glaive-function-calling-v2/simple-function-calling-v2_converted.json...
01/04/2024 09:53:50 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
Using custom data configuration default-b024aadef2a1493c
Loading Dataset Infos from /home/hangyu5/anaconda3/envs/llama_factory/lib/python3.11/site-packages/datasets/packaged_modules/json
Overwrite dataset info from restored data version if exists.
Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
Found cached dataset json (/home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
Loading Dataset info from /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
[INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file vocab.json
[INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file merges.txt
[INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file added_tokens.json
[INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file special_tokens_map.json
[INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file tokenizer_config.json
[INFO|tokenization_utils_base.py:2024] 2024-01-04 09:53:51,685 >> loading file tokenizer.json
[WARNING|logging.py:314] 2024-01-04 09:53:51,743 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|configuration_utils.py:737] 2024-01-04 09:53:51,744 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
[INFO|configuration_utils.py:737] 2024-01-04 09:53:51,749 >> loading configuration file cognitivecomputations/dolphin-2_6-phi-2/config.json
[INFO|configuration_utils.py:802] 2024-01-04 09:53:51,750 >> Model config PhiConfig {
  "_name_or_path": "cognitivecomputations/dolphin-2_6-phi-2",
  "activation_function": "gelu_new",
  "architectures": [
    "PhiForCausalLM"
  ],
  "attn_pdrop": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi.PhiConfig",
    "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM"
  },
  "embd_pdrop": 0.0,
  "flash_attn": false,
  "flash_rotary": false,
  "fused_dense": false,
  "img_processor": null,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "phi-msft",
  "n_embd": 2560,
  "n_head": 32,
  "n_head_kv": null,
  "n_inner": null,
  "n_layer": 32,
  "n_positions": 2048,
  "resid_pdrop": 0.1,
  "rotary_dim": 32,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.36.2",
  "use_cache": false,
  "vocab_size": 51200
}

01/04/2024 09:53:51 - INFO - llmtuner.model.patcher - Quantizing model to 4 bit.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
01/04/2024 09:53:51 - INFO - llmtuner.model.patcher - Quantizing model to 4 bit.
[INFO|modeling_utils.py:2907] 2024-01-04 09:53:51,820 >> Overriding torch_dtype=None with `torch_dtype=torch.float16` due to requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass torch_dtype=torch.float16 to remove this warning.
[INFO|modeling_utils.py:3341] 2024-01-04 09:53:51,820 >> loading weights file cognitivecomputations/dolphin-2_6-phi-2/model.safetensors.index.json
[INFO|modeling_utils.py:1341] 2024-01-04 09:53:51,821 >> Instantiating PhiForCausalLM model under default dtype torch.float16.
[INFO|configuration_utils.py:826] 2024-01-04 09:53:51,821 >> Generate config GenerationConfig {
  "use_cache": false
}

[INFO|configuration_utils.py:826] 2024-01-04 09:53:51,822 >> Generate config GenerationConfig {
  "use_cache": false
}

[INFO|modeling_utils.py:3483] 2024-01-04 09:53:51,875 >> Detected 4-bit loading: activating 4-bit loading for this model

Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 50%|█████ | 1/2 [00:01<00:01, 1.28s/it]
Loading checkpoint shards: 50%|█████ | 1/2 [00:01<00:01, 1.27s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.46it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.29it/s]
[WARNING|modeling_utils.py:4175] 2024-01-04 09:53:53 >> Some weights of the model checkpoint at ./models/dolphin-2_6-phi-2 were not used when initializing PhiForCausalLM: ['lm_head.linear.lora_B.default.weight', 'lm_head.linear.lora_A.default.weight']
- This IS expected if you are initializing PhiForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing PhiForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[INFO|modeling_utils.py:4193] 2024-01-04 09:53:53,730 >> All the weights of PhiForCausalLM were initialized from the model checkpoint at ./models/dolphin-2_6-phi-2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use PhiForCausalLM for predictions without further training.

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.47it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.30it/s]
Some weights of the model checkpoint at ./models/dolphin-2_6-phi-2 were not used when initializing PhiForCausalLM: ['lm_head.linear.lora_B.default.weight', 'lm_head.linear.lora_A.default.weight']
- This IS expected if you are initializing PhiForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing PhiForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[INFO|configuration_utils.py:779] 2024-01-04 09:53:53,733 >> loading configuration file ./models/dolphin-2_6-phi-2/generation_config.json
[INFO|configuration_utils.py:826] 2024-01-04 09:53:53,733 >> Generate config GenerationConfig {}

[WARNING|modeling_utils.py:2045] 2024-01-04 09:53:53,816 >> You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
01/04/2024 09:53:53 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
01/04/2024 09:53:53 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
01/04/2024 09:53:53 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
01/04/2024 09:53:53 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
01/04/2024 09:53:53 - INFO - llmtuner.model.loader - trainable params: 2621440 || all params: 2782305280 || trainable%: 0.0942
01/04/2024 09:53:53 - INFO - llmtuner.model.loader - trainable params: 2621440 || all params: 2782305280 || trainable%: 0.0942

Running tokenizer on dataset: 0%| | 0/3347 [00:00<?, ? examples/s]
[WARNING|tokenization_utils_base.py:3835] 2024-01-04 09:53:55,217 >> Token indices sequence length is longer than the specified maximum sequence length for this model (2217 > 2048). Running this sequence through the model will result in indexing errors
Caching processed dataset at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-c64b6c6785bc1929.arrow

Running tokenizer on dataset: 30%|██▉ | 1000/3347 [00:02<00:06, 372.68 examples/s]
Running tokenizer on dataset: 60%|█████▉ | 2000/3347 [00:05<00:03, 387.09 examples/s]
Running tokenizer on dataset: 90%|████████▉ | 3000/3347 [00:07<00:00, 395.52 examples/s]
Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 396.84 examples/s]
Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 392.48 examples/s]
input_ids:
[32, 8537, 1022, 257, 11040, 2836, 290, 281, 11666, 4430, 8796, 13, 383, 8796, 3607, 7613, 11, 6496, 11, 290, 23507, 7429, 284, 262, 2836, 338, 2683, 13, 198, 20490, 25, 36230, 25, 921, 389, 257, 7613, 8796, 351, 1895, 284, 262, 1708, 5499, 13, 5765, 606, 611, 2672, 532, 198, 90, 198, 50284, 1, 3672, 1298, 366, 1136, 62, 1069, 3803, 62, 4873, 1600, 198, 50284, 1, 11213, 1298, 366, 3855, 262, 5163, 2494, 1022, 734, 19247, 1600, 198, 50284, 1, 17143, 7307, 1298, 1391, 198, 50280, 1, 4906, 1298, 366, 15252, 1600, 198, 50280, 1, 48310, 1298, 1391, 198, 50276, 1, 8692, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 422, 1, 198, 50276, 5512, 198, 50276, 1, 16793, 62, 34415, 1298, 1391, 198, 50272, 1, 4906, 1298, 366, 8841, 1600, 198, 50272, 1, 11213, 1298, 366, 464, 7395, 284, 10385, 284, 1, 198, 50276, 92, 198, 50280, 5512, 198, 50280, 1, 35827, 1298, 685, 198, 50276, 1, 8692, 62, 34415, 1600, 198, 50276, 1, 16793, 62, 34415, 1, 198, 50280, 60, 198, 50284, 92, 198, 92, 198, 198, 6090, 345, 1492, 257, 5474, 329, 502, 422, 968, 1971, 284, 3576, 30, 198, 48902, 25, 40, 1101, 7926, 11, 475, 314, 836, 470, 423, 262, 12971, 284, 1492, 13956, 13, 2011, 1459, 2163, 3578, 502, 284, 651, 262, 5163, 2494, 1022, 734, 19247, 13, 1002, 345, 761, 1037, 351, 326, 11, 1254, 1479, 284, 1265, 0, 50295]
inputs:
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
Human: SYSTEM: You are a helpful assistant with access to the following functions. Use them if required -
{
    "name": "get_exchange_rate",
    "description": "Get the exchange rate between two currencies",
    "parameters": {
        "type": "object",
        "properties": {
            "base_currency": {
                "type": "string",
                "description": "The currency to convert from"
            },
            "target_currency": {
                "type": "string",
                "description": "The currency to convert to"
            }
        },
        "required": [
            "base_currency",
            "target_currency"
        ]
    }
}

Can you book a flight for me from New York to London?
Assistant:I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!<|im_end|>
label_ids:
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 40, 1101, 7926, 11, 475, 314, 836, 470, 423, 262, 12971, 284, 1492, 13956, 13, 2011, 1459, 2163, 3578, 502, 284, 651, 262, 5163, 2494, 1022, 734, 19247, 13, 1002, 345, 761, 1037, 351, 326, 11, 1254, 1479, 284, 1265, 0, 50295]
labels:
I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!<|im_end|>
[INFO|training_args.py:1838] 2024-01-04 09:54:03,936 >> PyTorch: setting up devices
Caching indices mapping at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-2d738e000d25696c.arrow
Caching indices mapping at /home/hangyu5/.cache/huggingface/datasets/json/default-b024aadef2a1493c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-fe95a5c264c6067e.arrow

Running tokenizer on dataset: 0%| | 0/3347 [00:00<?, ? examples/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (2217 > 2048). Running this sequence through the model will result in indexing errors

Running tokenizer on dataset: 30%|██▉ | 1000/3347 [00:02<00:06, 375.58 examples/s]
Running tokenizer on dataset: 60%|█████▉ | 2000/3347 [00:05<00:03, 389.75 examples/s]
Running tokenizer on dataset: 90%|████████▉ | 3000/3347 [00:07<00:00, 396.16 examples/s]
Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 395.57 examples/s]
Running tokenizer on dataset: 100%|██████████| 3347/3347 [00:08<00:00, 392.61 examples/s]
[INFO|trainer.py:1706] 2024-01-04 09:54:13,452 >> ***** Running training *****
[INFO|trainer.py:1707] 2024-01-04 09:54:13,452 >> Num examples = 3,011
[INFO|trainer.py:1708] 2024-01-04 09:54:13,452 >> Num Epochs = 1
[INFO|trainer.py:1709] 2024-01-04 09:54:13,452 >> Instantaneous batch size per device = 1
[INFO|trainer.py:1712] 2024-01-04 09:54:13,452 >> Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1713] 2024-01-04 09:54:13,452 >> Gradient Accumulation steps = 4
[INFO|trainer.py:1714] 2024-01-04 09:54:13,452 >> Total optimization steps = 376
[INFO|trainer.py:1715] 2024-01-04 09:54:13,454 >> Number of trainable parameters = 2,621,440

0%| | 0/376 [00:00<?, ?it/s]
3%|▎ | 10/376 [00:14<08:10, 1.34s/it]
{'loss': 1.0017, 'learning_rate': 4.991278696516879e-05, 'epoch': 0.03}
5%|▌ | 20/376 [00:28<08:13, 1.39s/it]
{'loss': 0.881, 'learning_rate': 4.9651756349750716e-05, 'epoch': 0.05}
8%|▊ | 30/376 [00:43<08:23, 1.46s/it]
{'loss': 0.7979, 'learning_rate': 4.9218729375518135e-05, 'epoch': 0.08}
11%|█ | 40/376 [00:57<08:09, 1.46s/it]
{'loss': 0.7022, 'learning_rate': 4.861672729019797e-05, 'epoch': 0.11}
13%|█▎ | 50/376 [01:11<07:07, 1.31s/it]
{'loss': 0.5844, 'learning_rate': 4.784995028809707e-05, 'epoch': 0.13}
16%|█▌ | 60/376 [01:24<07:11, 1.36s/it]
{'loss': 0.4454, 'learning_rate': 4.692374820516679e-05, 'epoch': 0.16}
19%|█▊ | 70/376 [01:38<06:52, 1.35s/it]
{'loss': 0.4076, 'learning_rate': 4.584458319296868e-05, 'epoch': 0.19}
21%|██▏ | 80/376 [01:52<06:48, 1.38s/it]
{'loss': 0.4111, 'learning_rate': 4.4619984631966524e-05, 'epoch': 0.21}
24%|██▍ | 90/376 [02:07<07:18, 1.53s/it]
{'loss': 0.4115, 'learning_rate': 4.3258496598716736e-05, 'epoch': 0.24}
27%|██▋ | 100/376 [02:19<06:01, 1.31s/it]
{'loss': 0.3566, 'learning_rate': 4.176961825348059e-05, 'epoch': 0.27}
29%|██▉ | 110/376 [02:33<05:56, 1.34s/it]
{'loss': 0.4302, 'learning_rate': 4.016373756417669e-05, 'epoch': 0.29}
32%|███▏ | 120/376 [02:47<05:55, 1.39s/it]
{'loss': 0.4271, 'learning_rate': 3.845205882908432e-05, 'epoch': 0.32}
35%|███▍ | 130/376 [03:02<06:31, 1.59s/it]
{'loss': 0.4625, 'learning_rate': 3.6646524503974955e-05, 'epoch': 0.35}
37%|███▋ | 140/376 [03:17<05:54, 1.50s/it]
{'loss': 0.5066, 'learning_rate': 3.475973187908737e-05, 'epoch': 0.37}
40%|███▉ | 150/376 [03:32<05:37, 1.49s/it]
{'loss': 0.3887, 'learning_rate': 3.280484518729466e-05, 'epoch': 0.4}
43%|████▎ | 160/376 [03:44<04:41, 1.30s/it]
{'loss': 0.3675, 'learning_rate': 3.079550375668821e-05, 'epoch': 0.42}
45%|████▌ | 170/376 [03:59<05:00, 1.46s/it]
{'loss': 0.4095, 'learning_rate': 2.8745726848402036e-05, 'epoch': 0.45}
48%|████▊ | 180/376 [04:13<04:29, 1.37s/it]
{'loss': 0.3782, 'learning_rate': 2.6669815843628042e-05, 'epoch': 0.48}
51%|█████ | 190/376 [04:28<05:18, 1.71s/it]
{'loss': 0.4195, 'learning_rate': 2.4582254462267476e-05, 'epoch': 0.5}
53%|█████▎ | 200/376 [04:43<04:25, 1.51s/it]
{'loss': 0.3392, 'learning_rate': 2.2497607709397543e-05, 'epoch': 0.53}
56%|█████▌ | 210/376 [04:56<03:21, 1.21s/it]
{'loss': 0.3347, 'learning_rate': 2.0430420254607748e-05, 'epoch': 0.56}
59%|█████▊ | 220/376 [05:09<03:21, 1.29s/it]
{'loss': 0.4117, 'learning_rate': 1.8395114953217852e-05, 'epoch': 0.58}
61%|██████ | 230/376 [05:23<03:37, 1.49s/it]
{'loss': 0.3772, 'learning_rate': 1.640589221739926e-05, 'epoch': 0.61}
64%|██████▍ | 240/376 [05:36<03:04, 1.35s/it]
{'loss': 0.4403, 'learning_rate': 1.447663093929163e-05, 'epoch': 0.64}
66%|██████▋ | 250/376 [05:50<03:02, 1.45s/it]
{'loss': 0.3867, 'learning_rate': 1.2620791657378664e-05, 'epoch': 0.66}
69%|██████▉ | 260/376 [06:04<02:28, 1.28s/it]
{'loss': 0.3688, 'learning_rate': 1.0851322641735118e-05, 'epoch': 0.69}
72%|███████▏ | 270/376 [06:19<02:40, 1.51s/it]
{'loss': 0.3655, 'learning_rate': 9.180569553392535e-06, 'epoch': 0.72}
74%|███████▍ | 280/376 [06:33<02:15, 1.41s/it]
{'loss': 0.4144, 'learning_rate': 7.620189308133943e-06, 'epoch': 0.74}
77%|███████▋ | 290/376 [06:47<01:56, 1.35s/it]
{'loss': 0.3298, 'learning_rate': 6.181068745693716e-06, 'epoch': 0.77}
80%|███████▉ | 300/376 [07:00<01:31, 1.20s/it]
{'loss': 0.3337, 'learning_rate': 4.873248671810928e-06, 'epoch': 0.8}
82%|████████▏ | 310/376 [07:13<01:35, 1.44s/it]
{'loss': 0.3217, 'learning_rate': 3.7058538030980942e-06, 'epoch': 0.82}
85%|████████▌ | 320/376 [07:26<01:16, 1.36s/it]
{'loss': 0.3222, 'learning_rate': 2.687029103502972e-06, 'epoch': 0.85}
88%|████████▊ | 330/376 [07:41<01:10, 1.53s/it]
{'loss': 0.3989, 'learning_rate': 1.823882956546566e-06, 'epoch': 0.88}
90%|█████████ | 340/376 [07:55<00:48, 1.34s/it]
{'loss': 0.3805, 'learning_rate': 1.1224375698271894e-06, 'epoch': 0.9}
93%|█████████▎| 350/376 [08:08<00:34, 1.31s/it]
{'loss': 0.4108, 'learning_rate': 5.875869578203824e-07, 'epoch': 0.93}
96%|█████████▌| 360/376 [08:21<00:21, 1.37s/it]
{'loss': 0.3578, 'learning_rate': 2.230627961304993e-07, 'epoch': 0.96}
98%|█████████▊| 370/376 [08:36<00:08, 1.39s/it]
{'loss': 0.3453, 'learning_rate': 3.1408385430356516e-08, 'epoch': 0.98}
100%|██████████| 376/376 [08:44<00:00, 1.47s/it]
[INFO|trainer.py:3166] 2024-01-04 10:02:58,683 >> ***** Running Evaluation *****
[INFO|trainer.py:3168] 2024-01-04 10:02:58,683 >> Num examples = 335
[INFO|trainer.py:3171] 2024-01-04 10:02:58,683 >> Batch size = 1

0%| | 0/168 [00:00<?, ?it/s]
100%|██████████| 168/168 [00:28<00:00, 6.50it/s]
{'eval_loss': 0.35242682695388794, 'eval_runtime': 28.2403, 'eval_samples_per_second': 11.862, 'eval_steps_per_second': 5.949, 'epoch': 1.0}
100%|██████████| 376/376 [09:13<00:00, 1.47s/it]
100%|██████████| 168/168 [00:28<00:00, 6.50it/s]
[INFO|trainer.py:1947] 2024-01-04 10:03:26,926 >>

Training completed. Do not forget to share your model on huggingface.co/models =)


{'train_runtime': 553.4721, 'train_samples_per_second': 5.44, 'train_steps_per_second': 0.679, 'train_loss': 0.4441075046011742, 'epoch': 1.0}
100%|██████████| 376/376 [09:13<00:00, 1.47s/it]
100%|██████████| 376/376 [09:13<00:00, 1.47s/it]
[INFO|trainer.py:2889] 2024-01-04 10:03:26,930 >> Saving model checkpoint to ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora
[INFO|tokenization_utils_base.py:2432] 2024-01-04 10:03:26,973 >> tokenizer config file saved in ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/tokenizer_config.json
[INFO|tokenization_utils_base.py:2441] 2024-01-04 10:03:26,974 >> Special tokens file saved in ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/special_tokens_map.json
[INFO|tokenization_utils_base.py:2492] 2024-01-04 10:03:26,974 >> added tokens file saved in ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/added_tokens.json
***** train metrics *****
  epoch                    =        1.0
  train_loss               =     0.4441
  train_runtime            = 0:09:13.47
  train_samples_per_second =       5.44
  train_steps_per_second   =      0.679
Figure saved: ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/training_loss.png
Figure saved: ./models/sft/dolphin-2_6-phi-2-sft-glaive-function-calling-v2-ep1-lora/training_eval_loss.png
[INFO|trainer.py:3166] 2024-01-04 10:03:27,895 >> ***** Running Evaluation *****
[INFO|trainer.py:3168] 2024-01-04 10:03:27,895 >> Num examples = 335
[INFO|trainer.py:3171] 2024-01-04 10:03:27,895 >> Batch size = 1

0%| | 0/168 [00:00<?, ?it/s]
51%|█████ | 86/168 [00:14<00:14, 5.53it/s]
|
1392 |
+
52%|█████▏ | 87/168 [00:14<00:13, 6.01it/s]
|
1393 |
+
52%|█████▏ | 88/168 [00:14<00:12, 6.64it/s]
|
1394 |
+
53%|█████▎ | 89/168 [00:14<00:12, 6.46it/s]
|
1395 |
+
54%|█████▎ | 90/168 [00:14<00:11, 6.51it/s]
|
1396 |
+
54%|█████▍ | 91/168 [00:15<00:11, 6.53it/s]
|
1397 |
+
55%|█████▍ | 92/168 [00:15<00:10, 7.09it/s]
|
1398 |
+
55%|█████▌ | 93/168 [00:15<00:10, 7.20it/s]
|
1399 |
+
56%|█████▌ | 94/168 [00:15<00:10, 7.29it/s]
|
1400 |
+
57%|█████▋ | 95/168 [00:15<00:09, 7.34it/s]
|
1401 |
+
57%|█████▋ | 96/168 [00:15<00:12, 5.72it/s]
|
1402 |
+
58%|█████▊ | 97/168 [00:16<00:13, 5.09it/s]
|
1403 |
+
58%|█████▊ | 98/168 [00:16<00:14, 4.85it/s]
|
1404 |
+
59%|█████▉ | 99/168 [00:16<00:12, 5.42it/s]
|
1405 |
+
60%|█████▉ | 100/168 [00:16<00:11, 5.69it/s]
|
1406 |
+
60%|██████ | 101/168 [00:16<00:10, 6.38it/s]
|
1407 |
+
61%|██████ | 102/168 [00:16<00:10, 6.57it/s]
|
1408 |
+
61%|██████▏ | 103/168 [00:16<00:09, 7.15it/s]
|
1409 |
+
62%|██████▏ | 104/168 [00:17<00:11, 5.73it/s]
|
1410 |
+
62%|██████▎ | 105/168 [00:17<00:10, 6.08it/s]
|
1411 |
+
63%|██████▎ | 106/168 [00:17<00:09, 6.21it/s]
|
1412 |
+
64%|██████▎ | 107/168 [00:17<00:09, 6.46it/s]
|
1413 |
+
64%|██████▍ | 108/168 [00:17<00:09, 6.60it/s]
|
1414 |
+
65%|██████▍ | 109/168 [00:17<00:09, 6.36it/s]
|
1415 |
+
65%|██████▌ | 110/168 [00:18<00:08, 6.97it/s]
|
1416 |
+
66%|██████▌ | 111/168 [00:18<00:08, 6.43it/s]
|
1417 |
+
67%|██████▋ | 112/168 [00:18<00:08, 6.29it/s]
|
1418 |
+
67%|██████▋ | 113/168 [00:18<00:09, 6.02it/s]
|
1419 |
+
68%|██████▊ | 114/168 [00:18<00:08, 6.34it/s]
|
1420 |
+
68%|██████▊ | 115/168 [00:18<00:07, 6.66it/s]
|
1421 |
+
69%|██████▉ | 116/168 [00:19<00:08, 5.85it/s]
|
1422 |
+
70%|██████▉ | 117/168 [00:19<00:08, 6.21it/s]
|
1423 |
+
70%|███████ | 118/168 [00:19<00:08, 6.16it/s]
|
1424 |
+
71%|███████ | 119/168 [00:19<00:08, 5.47it/s]
|
1425 |
+
71%|███████▏ | 120/168 [00:19<00:09, 4.83it/s]
|
1426 |
+
72%|███████▏ | 121/168 [00:20<00:09, 4.99it/s]
|
1427 |
+
73%|███████▎ | 122/168 [00:20<00:09, 4.75it/s]
|
1428 |
+
73%|███████▎ | 123/168 [00:20<00:08, 5.47it/s]
|
1429 |
+
74%|███████▍ | 124/168 [00:20<00:08, 5.03it/s]
|
1430 |
+
74%|███████▍ | 125/168 [00:20<00:07, 5.49it/s]
|
1431 |
+
75%|███████▌ | 126/168 [00:20<00:06, 6.18it/s]
|
1432 |
+
76%|███████▌ | 127/168 [00:21<00:06, 6.57it/s]
|
1433 |
+
76%|███████▌ | 128/168 [00:21<00:07, 5.39it/s]
|
1434 |
+
77%|███████▋ | 129/168 [00:21<00:07, 5.44it/s]
|
1435 |
+
77%|███████▋ | 130/168 [00:21<00:07, 4.82it/s]
|
1436 |
+
78%|███████▊ | 131/168 [00:21<00:06, 5.59it/s]
|
1437 |
+
79%|███████▊ | 132/168 [00:22<00:06, 5.74it/s]
|
1438 |
+
79%|███████▉ | 133/168 [00:22<00:06, 5.67it/s]
|
1439 |
+
80%|███████▉ | 134/168 [00:22<00:05, 6.34it/s]
|
1440 |
+
80%|████████ | 135/168 [00:22<00:05, 6.52it/s]
|
1441 |
+
81%|████████ | 136/168 [00:22<00:04, 7.07it/s]
|
1442 |
+
82%|████████▏ | 137/168 [00:22<00:05, 5.69it/s]
|
1443 |
+
82%|████████▏ | 138/168 [00:23<00:05, 5.85it/s]
|
1444 |
+
83%|████████▎ | 139/168 [00:23<00:04, 6.27it/s]
|
1445 |
+
83%|████████▎ | 140/168 [00:23<00:04, 6.96it/s]
|
1446 |
+
84%|████████▍ | 141/168 [00:23<00:03, 7.33it/s]
|
1447 |
+
85%|████████▍ | 142/168 [00:23<00:03, 7.28it/s]
|
1448 |
+
85%|████████▌ | 143/168 [00:23<00:04, 5.78it/s]
|
1449 |
+
86%|████████▌ | 144/168 [00:24<00:04, 5.05it/s]
|
1450 |
+
86%|████████▋ | 145/168 [00:24<00:04, 5.33it/s]
|
1451 |
+
87%|████████▋ | 146/168 [00:24<00:03, 5.79it/s]
|
1452 |
+
88%|████████▊ | 147/168 [00:24<00:03, 6.16it/s]
|
1453 |
+
88%|████████▊ | 148/168 [00:24<00:02, 6.82it/s]
|
1454 |
+
89%|████████▊ | 149/168 [00:24<00:03, 5.53it/s]
|
1455 |
+
90%|████████▉ | 151/168 [00:25<00:02, 5.67it/s]
|
1456 |
+
91%|█████████ | 153/168 [00:25<00:02, 5.77it/s]
|
1457 |
+
92%|█████████▏| 154/168 [00:25<00:02, 6.06it/s]
|
1458 |
+
92%|█████████▏| 155/168 [00:25<00:02, 5.79it/s]
|
1459 |
+
93%|█████████▎| 156/168 [00:26<00:02, 5.38it/s]
|
1460 |
+
93%|█████████▎| 157/168 [00:26<00:02, 5.35it/s]
|
1461 |
+
94%|█████████▍| 158/168 [00:26<00:01, 6.06it/s]
|
1462 |
+
95%|█████████▍| 159/168 [00:26<00:01, 6.10it/s]
|
1463 |
+
95%|█████████▌| 160/168 [00:26<00:01, 5.81it/s]
|
1464 |
+
96%|█████████▌| 161/168 [00:26<00:01, 5.73it/s]
|
1465 |
+
96%|█████████▋| 162/168 [00:27<00:01, 5.98it/s]
|
1466 |
+
97%|█████████▋| 163/168 [00:27<00:00, 5.16it/s]
|
1467 |
+
98%|█████████▊| 164/168 [00:27<00:00, 5.61it/s]
|
1468 |
+
98%|█████████▊| 165/168 [00:27<00:00, 6.31it/s]
|
1469 |
+
99%|█████████▉| 166/168 [00:27<00:00, 6.79it/s]
|
1470 |
+
100%|██████████| 168/168 [00:28<00:00, 6.51it/s]
|
1471 |
+
100%|██████████| 168/168 [00:28<00:00, 5.99it/s]
|
1472 |
+
***** eval metrics *****
  epoch                   =        1.0
  eval_loss               =     0.3524
  eval_runtime            = 0:00:28.24
  eval_samples_per_second =     11.859
  eval_steps_per_second   =      5.947
[INFO|modelcard.py:452] 2024-01-04 10:03:56,150 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
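Note: eval_loss is a mean per-token cross-entropy, so the matching perplexity is one exponential away (a quick check, not part of the original log):

import math
eval_loss = 0.35242682695388794  # from the eval metrics above
print(math.exp(eval_loss))       # ≈ 1.42 perplexity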
train_results.json
ADDED
@@ -0,0 +1,7 @@
+{
+    "epoch": 1.0,
+    "train_loss": 0.4441075046011742,
+    "train_runtime": 553.4721,
+    "train_samples_per_second": 5.44,
+    "train_steps_per_second": 0.679
+}
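Note: these figures are internally consistent; multiplying them out recovers the step and sample counts and suggests an effective batch size of 8 (presumably per-device batch 1 with gradient accumulation; the accumulation factor is an inference, not read from the config):

train_runtime = 553.4721
print(train_runtime * 0.679)       # ≈ 375.8 -> the 376 optimizer steps
print(train_runtime * 5.44)        # ≈ 3010.9 -> ~3011 training samples seen
print(train_runtime * 5.44 / 376)  # ≈ 8.0 -> samples per optimizer step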
trainer_log.jsonl
ADDED
@@ -0,0 +1,40 @@
+{"current_steps": 10, "total_steps": 376, "loss": 1.0017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.991278696516879e-05, "epoch": 0.03, "percentage": 2.66, "elapsed_time": "0:00:14", "remaining_time": "0:08:49"}
+{"current_steps": 20, "total_steps": 376, "loss": 0.881, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9651756349750716e-05, "epoch": 0.05, "percentage": 5.32, "elapsed_time": "0:00:28", "remaining_time": "0:08:24"}
+{"current_steps": 30, "total_steps": 376, "loss": 0.7979, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9218729375518135e-05, "epoch": 0.08, "percentage": 7.98, "elapsed_time": "0:00:43", "remaining_time": "0:08:16"}
+{"current_steps": 40, "total_steps": 376, "loss": 0.7022, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.861672729019797e-05, "epoch": 0.11, "percentage": 10.64, "elapsed_time": "0:00:57", "remaining_time": "0:08:05"}
+{"current_steps": 50, "total_steps": 376, "loss": 0.5844, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.784995028809707e-05, "epoch": 0.13, "percentage": 13.3, "elapsed_time": "0:01:11", "remaining_time": "0:07:45"}
+{"current_steps": 60, "total_steps": 376, "loss": 0.4454, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.692374820516679e-05, "epoch": 0.16, "percentage": 15.96, "elapsed_time": "0:01:24", "remaining_time": "0:07:24"}
+{"current_steps": 70, "total_steps": 376, "loss": 0.4076, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.584458319296868e-05, "epoch": 0.19, "percentage": 18.62, "elapsed_time": "0:01:38", "remaining_time": "0:07:12"}
+{"current_steps": 80, "total_steps": 376, "loss": 0.4111, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4619984631966524e-05, "epoch": 0.21, "percentage": 21.28, "elapsed_time": "0:01:52", "remaining_time": "0:06:56"}
+{"current_steps": 90, "total_steps": 376, "loss": 0.4115, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3258496598716736e-05, "epoch": 0.24, "percentage": 23.94, "elapsed_time": "0:02:07", "remaining_time": "0:06:43"}
+{"current_steps": 100, "total_steps": 376, "loss": 0.3566, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.176961825348059e-05, "epoch": 0.27, "percentage": 26.6, "elapsed_time": "0:02:19", "remaining_time": "0:06:25"}
+{"current_steps": 110, "total_steps": 376, "loss": 0.4302, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.016373756417669e-05, "epoch": 0.29, "percentage": 29.26, "elapsed_time": "0:02:33", "remaining_time": "0:06:11"}
+{"current_steps": 120, "total_steps": 376, "loss": 0.4271, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.845205882908432e-05, "epoch": 0.32, "percentage": 31.91, "elapsed_time": "0:02:47", "remaining_time": "0:05:58"}
+{"current_steps": 130, "total_steps": 376, "loss": 0.4625, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6646524503974955e-05, "epoch": 0.35, "percentage": 34.57, "elapsed_time": "0:03:02", "remaining_time": "0:05:45"}
+{"current_steps": 140, "total_steps": 376, "loss": 0.5066, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.475973187908737e-05, "epoch": 0.37, "percentage": 37.23, "elapsed_time": "0:03:17", "remaining_time": "0:05:32"}
+{"current_steps": 150, "total_steps": 376, "loss": 0.3887, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.280484518729466e-05, "epoch": 0.4, "percentage": 39.89, "elapsed_time": "0:03:32", "remaining_time": "0:05:19"}
+{"current_steps": 160, "total_steps": 376, "loss": 0.3675, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.079550375668821e-05, "epoch": 0.42, "percentage": 42.55, "elapsed_time": "0:03:44", "remaining_time": "0:05:03"}
+{"current_steps": 170, "total_steps": 376, "loss": 0.4095, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8745726848402036e-05, "epoch": 0.45, "percentage": 45.21, "elapsed_time": "0:03:59", "remaining_time": "0:04:49"}
+{"current_steps": 180, "total_steps": 376, "loss": 0.3782, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6669815843628042e-05, "epoch": 0.48, "percentage": 47.87, "elapsed_time": "0:04:13", "remaining_time": "0:04:35"}
+{"current_steps": 190, "total_steps": 376, "loss": 0.4195, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4582254462267476e-05, "epoch": 0.5, "percentage": 50.53, "elapsed_time": "0:04:28", "remaining_time": "0:04:22"}
+{"current_steps": 200, "total_steps": 376, "loss": 0.3392, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2497607709397543e-05, "epoch": 0.53, "percentage": 53.19, "elapsed_time": "0:04:43", "remaining_time": "0:04:09"}
+{"current_steps": 210, "total_steps": 376, "loss": 0.3347, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0430420254607748e-05, "epoch": 0.56, "percentage": 55.85, "elapsed_time": "0:04:56", "remaining_time": "0:03:54"}
+{"current_steps": 220, "total_steps": 376, "loss": 0.4117, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8395114953217852e-05, "epoch": 0.58, "percentage": 58.51, "elapsed_time": "0:05:09", "remaining_time": "0:03:39"}
+{"current_steps": 230, "total_steps": 376, "loss": 0.3772, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.640589221739926e-05, "epoch": 0.61, "percentage": 61.17, "elapsed_time": "0:05:23", "remaining_time": "0:03:25"}
+{"current_steps": 240, "total_steps": 376, "loss": 0.4403, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.447663093929163e-05, "epoch": 0.64, "percentage": 63.83, "elapsed_time": "0:05:36", "remaining_time": "0:03:10"}
+{"current_steps": 250, "total_steps": 376, "loss": 0.3867, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2620791657378664e-05, "epoch": 0.66, "percentage": 66.49, "elapsed_time": "0:05:50", "remaining_time": "0:02:56"}
+{"current_steps": 260, "total_steps": 376, "loss": 0.3688, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0851322641735118e-05, "epoch": 0.69, "percentage": 69.15, "elapsed_time": "0:06:04", "remaining_time": "0:02:42"}
+{"current_steps": 270, "total_steps": 376, "loss": 0.3655, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.180569553392535e-06, "epoch": 0.72, "percentage": 71.81, "elapsed_time": "0:06:19", "remaining_time": "0:02:28"}
+{"current_steps": 280, "total_steps": 376, "loss": 0.4144, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.620189308133943e-06, "epoch": 0.74, "percentage": 74.47, "elapsed_time": "0:06:33", "remaining_time": "0:02:15"}
+{"current_steps": 290, "total_steps": 376, "loss": 0.3298, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.181068745693716e-06, "epoch": 0.77, "percentage": 77.13, "elapsed_time": "0:06:47", "remaining_time": "0:02:00"}
+{"current_steps": 300, "total_steps": 376, "loss": 0.3337, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.873248671810928e-06, "epoch": 0.8, "percentage": 79.79, "elapsed_time": "0:07:00", "remaining_time": "0:01:46"}
+{"current_steps": 310, "total_steps": 376, "loss": 0.3217, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7058538030980942e-06, "epoch": 0.82, "percentage": 82.45, "elapsed_time": "0:07:13", "remaining_time": "0:01:32"}
+{"current_steps": 320, "total_steps": 376, "loss": 0.3222, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.687029103502972e-06, "epoch": 0.85, "percentage": 85.11, "elapsed_time": "0:07:26", "remaining_time": "0:01:18"}
+{"current_steps": 330, "total_steps": 376, "loss": 0.3989, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.823882956546566e-06, "epoch": 0.88, "percentage": 87.77, "elapsed_time": "0:07:41", "remaining_time": "0:01:04"}
+{"current_steps": 340, "total_steps": 376, "loss": 0.3805, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1224375698271894e-06, "epoch": 0.9, "percentage": 90.43, "elapsed_time": "0:07:55", "remaining_time": "0:00:50"}
+{"current_steps": 350, "total_steps": 376, "loss": 0.4108, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.875869578203824e-07, "epoch": 0.93, "percentage": 93.09, "elapsed_time": "0:08:08", "remaining_time": "0:00:36"}
+{"current_steps": 360, "total_steps": 376, "loss": 0.3578, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.230627961304993e-07, "epoch": 0.96, "percentage": 95.74, "elapsed_time": "0:08:21", "remaining_time": "0:00:22"}
+{"current_steps": 370, "total_steps": 376, "loss": 0.3453, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1408385430356516e-08, "epoch": 0.98, "percentage": 98.4, "elapsed_time": "0:08:36", "remaining_time": "0:00:08"}
+{"current_steps": 376, "total_steps": 376, "loss": null, "eval_loss": 0.35242682695388794, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:08:44", "remaining_time": "0:00:00"}
+{"current_steps": 376, "total_steps": 376, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:08:44", "remaining_time": "0:00:00"}
+{"current_steps": 168, "total_steps": 168, "loss": null, "eval_loss": 0.35242682695388794, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:09:42", "remaining_time": "0:00:00"}
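Note: the logged learning rates trace a cosine decay; reconstructing the schedule with an assumed peak LR of 5e-5 (inferred from the values, not read from the training config) reproduces the entries to within rounding:

import math

PEAK_LR, TOTAL_STEPS = 5e-5, 376  # peak LR is an assumption inferred from the log

def cosine_lr(step):
    return 0.5 * PEAK_LR * (1 + math.cos(math.pi * step / TOTAL_STEPS))

print(cosine_lr(10))   # ≈ 4.9913e-05, matches the step-10 entry
print(cosine_lr(190))  # ≈ 2.4582e-05, matches the step-190 entry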
trainer_state.json
ADDED
@@ -0,0 +1,260 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.99867197875166,
+  "eval_steps": 500,
+  "global_step": 376,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.03,
+      "learning_rate": 4.991278696516879e-05,
+      "loss": 1.0017,
+      "step": 10
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 4.9651756349750716e-05,
+      "loss": 0.881,
+      "step": 20
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 4.9218729375518135e-05,
+      "loss": 0.7979,
+      "step": 30
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 4.861672729019797e-05,
+      "loss": 0.7022,
+      "step": 40
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 4.784995028809707e-05,
+      "loss": 0.5844,
+      "step": 50
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 4.692374820516679e-05,
+      "loss": 0.4454,
+      "step": 60
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.584458319296868e-05,
+      "loss": 0.4076,
+      "step": 70
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.4619984631966524e-05,
+      "loss": 0.4111,
+      "step": 80
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.3258496598716736e-05,
+      "loss": 0.4115,
+      "step": 90
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.176961825348059e-05,
+      "loss": 0.3566,
+      "step": 100
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.016373756417669e-05,
+      "loss": 0.4302,
+      "step": 110
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 3.845205882908432e-05,
+      "loss": 0.4271,
+      "step": 120
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 3.6646524503974955e-05,
+      "loss": 0.4625,
+      "step": 130
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 3.475973187908737e-05,
+      "loss": 0.5066,
+      "step": 140
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 3.280484518729466e-05,
+      "loss": 0.3887,
+      "step": 150
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 3.079550375668821e-05,
+      "loss": 0.3675,
+      "step": 160
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 2.8745726848402036e-05,
+      "loss": 0.4095,
+      "step": 170
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 2.6669815843628042e-05,
+      "loss": 0.3782,
+      "step": 180
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 2.4582254462267476e-05,
+      "loss": 0.4195,
+      "step": 190
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 2.2497607709397543e-05,
+      "loss": 0.3392,
+      "step": 200
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 2.0430420254607748e-05,
+      "loss": 0.3347,
+      "step": 210
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 1.8395114953217852e-05,
+      "loss": 0.4117,
+      "step": 220
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 1.640589221739926e-05,
+      "loss": 0.3772,
+      "step": 230
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 1.447663093929163e-05,
+      "loss": 0.4403,
+      "step": 240
+    },
+    {
+      "epoch": 0.66,
+      "learning_rate": 1.2620791657378664e-05,
+      "loss": 0.3867,
+      "step": 250
+    },
+    {
+      "epoch": 0.69,
+      "learning_rate": 1.0851322641735118e-05,
+      "loss": 0.3688,
+      "step": 260
+    },
+    {
+      "epoch": 0.72,
+      "learning_rate": 9.180569553392535e-06,
+      "loss": 0.3655,
+      "step": 270
+    },
+    {
+      "epoch": 0.74,
+      "learning_rate": 7.620189308133943e-06,
+      "loss": 0.4144,
+      "step": 280
+    },
+    {
+      "epoch": 0.77,
+      "learning_rate": 6.181068745693716e-06,
+      "loss": 0.3298,
+      "step": 290
+    },
+    {
+      "epoch": 0.8,
+      "learning_rate": 4.873248671810928e-06,
+      "loss": 0.3337,
+      "step": 300
+    },
+    {
+      "epoch": 0.82,
+      "learning_rate": 3.7058538030980942e-06,
+      "loss": 0.3217,
+      "step": 310
+    },
+    {
+      "epoch": 0.85,
+      "learning_rate": 2.687029103502972e-06,
+      "loss": 0.3222,
+      "step": 320
+    },
+    {
+      "epoch": 0.88,
+      "learning_rate": 1.823882956546566e-06,
+      "loss": 0.3989,
+      "step": 330
+    },
+    {
+      "epoch": 0.9,
+      "learning_rate": 1.1224375698271894e-06,
+      "loss": 0.3805,
+      "step": 340
+    },
+    {
+      "epoch": 0.93,
+      "learning_rate": 5.875869578203824e-07,
+      "loss": 0.4108,
+      "step": 350
+    },
+    {
+      "epoch": 0.96,
+      "learning_rate": 2.230627961304993e-07,
+      "loss": 0.3578,
+      "step": 360
+    },
+    {
+      "epoch": 0.98,
+      "learning_rate": 3.1408385430356516e-08,
+      "loss": 0.3453,
+      "step": 370
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.35242682695388794,
+      "eval_runtime": 28.2403,
+      "eval_samples_per_second": 11.862,
+      "eval_steps_per_second": 5.949,
+      "step": 376
+    },
+    {
+      "epoch": 1.0,
+      "step": 376,
+      "total_flos": 2.22435081191424e+16,
+      "train_loss": 0.4441075046011742,
+      "train_runtime": 553.4721,
+      "train_samples_per_second": 5.44,
+      "train_steps_per_second": 0.679
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 376,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "total_flos": 2.22435081191424e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
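Note: trainer_state.json carries the same history as trainer_log.jsonl in a single JSON document, which makes post-hoc analysis straightforward; a minimal sketch:

import json

with open("trainer_state.json") as f:
    state = json.load(f)

train_logs = [r for r in state["log_history"] if "loss" in r]
print(len(train_logs), "logged training steps")  # 37
print(min(train_logs, key=lambda r: r["loss"]))  # lowest running loss: 0.3217 at step 310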
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43631845fad65fe78dc251bc687da889571e404213e9b9c084870732e93c38ea
+size 4984
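Note: training_args.bin appears here only as a Git LFS pointer; the underlying file is a torch-serialized TrainingArguments object, so it can be inspected once pulled from LFS (a sketch, assuming torch and transformers are importable):

import torch

# Unpickles a transformers TrainingArguments; on newer torch versions you may
# need torch.load("training_args.bin", weights_only=False).
args = torch.load("training_args.bin")
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)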
training_eval_loss.png
ADDED
training_loss.png
ADDED
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff