Spaces:
Runtime error
Runtime error
dsmueller
commited on
Commit
·
4ec4a3c
1
Parent(s):
e51648a
Fix num_epochs parameter in TrainingArguments
Browse files
app.ipynb
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
@@ -22,7 +22,7 @@
|
|
22 |
},
|
23 |
{
|
24 |
"cell_type": "code",
|
25 |
-
"execution_count":
|
26 |
"metadata": {},
|
27 |
"outputs": [],
|
28 |
"source": [
|
@@ -37,9 +37,17 @@
|
|
37 |
},
|
38 |
{
|
39 |
"cell_type": "code",
|
40 |
-
"execution_count":
|
41 |
"metadata": {},
|
42 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
"source": [
|
44 |
"# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'\n",
|
45 |
"model_name = 'mistralai/Mistral-7B-v0.1'\n",
|
@@ -55,9 +63,20 @@
|
|
55 |
},
|
56 |
{
|
57 |
"cell_type": "code",
|
58 |
-
"execution_count":
|
59 |
"metadata": {},
|
60 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
"source": [
|
62 |
"# Write dataset files into data directory\n",
|
63 |
"data_directory = './fine_tune_data/'\n",
|
@@ -128,16 +147,30 @@
|
|
128 |
},
|
129 |
{
|
130 |
"cell_type": "code",
|
131 |
-
"execution_count":
|
132 |
"metadata": {},
|
133 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
"source": [
|
135 |
"args_custom=transformers.TrainingArguments(\n",
|
136 |
" per_device_train_batch_size=model_params['batch_size'],\n",
|
137 |
" per_device_eval_batch_size=model_params['batch_size'],\n",
|
138 |
" gradient_accumulation_steps=model_params['gradient_accumulation'],\n",
|
139 |
" warmup_ratio=model_params['warmup_ratio'],\n",
|
140 |
-
"
|
141 |
" learning_rate=model_params['lr'],\n",
|
142 |
" fp16=True,\n",
|
143 |
" logging_steps=model_params['logging_steps'],\n",
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 20,
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
|
|
22 |
},
|
23 |
{
|
24 |
"cell_type": "code",
|
25 |
+
"execution_count": 21,
|
26 |
"metadata": {},
|
27 |
"outputs": [],
|
28 |
"source": [
|
|
|
37 |
},
|
38 |
{
|
39 |
"cell_type": "code",
|
40 |
+
"execution_count": 22,
|
41 |
"metadata": {},
|
42 |
+
"outputs": [
|
43 |
+
{
|
44 |
+
"name": "stdout",
|
45 |
+
"output_type": "stream",
|
46 |
+
"text": [
|
47 |
+
"Model Max Length: 1000000000000000019884624838656\n"
|
48 |
+
]
|
49 |
+
}
|
50 |
+
],
|
51 |
"source": [
|
52 |
"# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'\n",
|
53 |
"model_name = 'mistralai/Mistral-7B-v0.1'\n",
|
|
|
63 |
},
|
64 |
{
|
65 |
"cell_type": "code",
|
66 |
+
"execution_count": 23,
|
67 |
"metadata": {},
|
68 |
+
"outputs": [
|
69 |
+
{
|
70 |
+
"name": "stdout",
|
71 |
+
"output_type": "stream",
|
72 |
+
"text": [
|
73 |
+
"Max token length train: 1121\n",
|
74 |
+
"Max token length validation: 38\n",
|
75 |
+
"Block size: 2242\n",
|
76 |
+
"{'project_name': './llms/ams_data_train-100_91a45e55-876a-4b93-a9e7-70d26238cd33', 'model_name': 'mistralai/Mistral-7B-v0.1', 'repo_id': 'ai-aerospace/ams-data-train-100-81dbb7fc-16f6-4870-a898-c1840e33430d', 'train_data': 'train_data', 'validation_data': 'validation_data', 'data_directory': './fine_tune_data/', 'block_size': 2242, 'model_max_length': 1121, 'logging_steps': -1, 'evaluation_strategy': 'epoch', 'save_total_limit': 1, 'save_strategy': 'epoch', 'mixed_precision': 'fp16', 'lr': 3e-05, 'epochs': 3, 'batch_size': 2, 'warmup_ratio': 0.1, 'gradient_accumulation': 1, 'optimizer': 'adamw_torch', 'scheduler': 'linear', 'weight_decay': 0, 'max_grad_norm': 1, 'seed': 42, 'quantization': 'int4', 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05}\n"
|
77 |
+
]
|
78 |
+
}
|
79 |
+
],
|
80 |
"source": [
|
81 |
"# Write dataset files into data directory\n",
|
82 |
"data_directory = './fine_tune_data/'\n",
|
|
|
147 |
},
|
148 |
{
|
149 |
"cell_type": "code",
|
150 |
+
"execution_count": 27,
|
151 |
"metadata": {},
|
152 |
+
"outputs": [
|
153 |
+
{
|
154 |
+
"ename": "ValueError",
|
155 |
+
"evalue": "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).",
|
156 |
+
"output_type": "error",
|
157 |
+
"traceback": [
|
158 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
159 |
+
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
160 |
+
"Cell \u001b[0;32mIn[27], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m args_custom\u001b[38;5;241m=\u001b[39m\u001b[43mtransformers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTrainingArguments\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mper_device_train_batch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbatch_size\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mper_device_eval_batch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbatch_size\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mgradient_accumulation_steps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgradient_accumulation\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mwarmup_ratio\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mwarmup_ratio\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_train_epochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mepochs\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mlearning_rate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mlr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mfp16\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mlogging_steps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mlogging_steps\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43msave_total_limit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msave_total_limit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43mevaluation_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mevaluation_strategy\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetric_for_best_model\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mf1\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmodel_outputs\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43mlogging_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmodel_outputs\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43moptim\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43moptimizer\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_grad_norm\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmax_grad_norm\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mweight_decay\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[43mlr_scheduler_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_params\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mscheduler\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# Args from medium article\u001b[39;00m\n\u001b[1;32m 22\u001b[0m args_medium\u001b[38;5;241m=\u001b[39mtransformers\u001b[38;5;241m.\u001b[39mTrainingArguments(\n\u001b[1;32m 23\u001b[0m per_device_train_batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m8\u001b[39m,\n\u001b[1;32m 24\u001b[0m per_device_eval_batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m32\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 40\u001b[0m report_to\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwandb\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;66;03m# enable logging to W&B\u001b[39;00m\n\u001b[1;32m 41\u001b[0m )\n",
|
161 |
+
"File \u001b[0;32m<string>:121\u001b[0m, in \u001b[0;36m__init__\u001b[0;34m(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, eval_delay, learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, lr_scheduler_kwargs, warmup_ratio, warmup_steps, log_level, log_level_replica, log_on_each_node, logging_dir, logging_strategy, logging_first_step, logging_steps, logging_nan_inf_filter, save_strategy, save_steps, save_total_limit, save_safetensors, save_on_each_node, save_only_model, no_cuda, use_cpu, use_mps_device, seed, data_seed, jit_mode_eval, use_ipex, bf16, fp16, fp16_opt_level, half_precision_backend, bf16_full_eval, fp16_full_eval, tf32, local_rank, ddp_backend, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, fsdp, fsdp_min_num_params, fsdp_config, fsdp_transformer_layer_cls_to_wrap, deepspeed, label_smoothing_factor, optim, optim_args, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, ddp_bucket_cap_mb, ddp_broadcast_buffers, dataloader_pin_memory, dataloader_persistent_workers, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, hub_model_id, hub_strategy, hub_token, hub_private_repo, hub_always_push, gradient_checkpointing, gradient_checkpointing_kwargs, include_inputs_for_metrics, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters, auto_find_batch_size, full_determinism, torchdynamo, ray_scope, ddp_timeout, torch_compile, torch_compile_backend, torch_compile_mode, dispatch_batches, split_batches, include_tokens_per_second, include_num_input_tokens_seen, neftune_noise_alpha)\u001b[0m\n",
|
162 |
+
"File \u001b[0;32m~/Repositories/HuggingFace/fine-tuning-playground/.venv/lib/python3.11/site-packages/transformers/training_args.py:1499\u001b[0m, in \u001b[0;36mTrainingArguments.__post_init__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1488\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m--optim adamw_torch_fused with --fp16 requires PyTorch>2.0\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1490\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 1491\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mframework \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1492\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m is_torch_available()\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfp16 \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfp16_full_eval)\n\u001b[1;32m 1498\u001b[0m ):\n\u001b[0;32m-> 1499\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1501\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1502\u001b[0m )\n\u001b[1;32m 1504\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 1505\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mframework \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1506\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m is_torch_available()\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1513\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbf16 \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbf16_full_eval)\n\u001b[1;32m 1514\u001b[0m ):\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBF16 Mixed precision training with AMP (`--bf16`) and BF16 half precision evaluation\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m (`--bf16_full_eval`) can only be used on CUDA, XPU (with IPEX), NPU or CPU/TPU/NeuronCore devices.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1518\u001b[0m )\n",
|
163 |
+
"\u001b[0;31mValueError\u001b[0m: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX)."
|
164 |
+
]
|
165 |
+
}
|
166 |
+
],
|
167 |
"source": [
|
168 |
"args_custom=transformers.TrainingArguments(\n",
|
169 |
" per_device_train_batch_size=model_params['batch_size'],\n",
|
170 |
" per_device_eval_batch_size=model_params['batch_size'],\n",
|
171 |
" gradient_accumulation_steps=model_params['gradient_accumulation'],\n",
|
172 |
" warmup_ratio=model_params['warmup_ratio'],\n",
|
173 |
+
" num_train_epochs=model_params['epochs'],\n",
|
174 |
" learning_rate=model_params['lr'],\n",
|
175 |
" fp16=True,\n",
|
176 |
" logging_steps=model_params['logging_steps'],\n",
|
app.py
CHANGED
@@ -110,7 +110,7 @@ args_custom=transformers.TrainingArguments(
|
|
110 |
per_device_eval_batch_size=model_params['batch_size'],
|
111 |
gradient_accumulation_steps=model_params['gradient_accumulation'],
|
112 |
warmup_ratio=model_params['warmup_ratio'],
|
113 |
-
|
114 |
learning_rate=model_params['lr'],
|
115 |
fp16=True,
|
116 |
logging_steps=model_params['logging_steps'],
|
|
|
110 |
per_device_eval_batch_size=model_params['batch_size'],
|
111 |
gradient_accumulation_steps=model_params['gradient_accumulation'],
|
112 |
warmup_ratio=model_params['warmup_ratio'],
|
113 |
+
num_train_epochs=model_params['epochs'],
|
114 |
learning_rate=model_params['lr'],
|
115 |
fp16=True,
|
116 |
logging_steps=model_params['logging_steps'],
|