End of training
- README.md +19 -13
- Untitled1.ipynb +0 -0
README.md
CHANGED
@@ -8,7 +8,7 @@ tags:
 datasets:
 - medalpaca/medical_meadow_medqa
 model-index:
-- name: sft-qwen-25-7b-instruct
+- name: sft-qwen-25-7b-instruct-2
   results: []
 ---

@@ -50,10 +50,10 @@ wandb_log_model:

 gradient_accumulation_steps: 1
 micro_batch_size: 1
-num_epochs:
+num_epochs: 2
 optimizer: adamw_torch
 lr_scheduler: cosine
-learning_rate: 0.
+learning_rate: 0.000005

 train_on_inputs: false
 group_by_length: false
@@ -68,8 +68,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps:
-eval_steps:
-save_steps:
+eval_steps:
+save_steps:

 evals_per_epoch:
 saves_per_epoch:
@@ -81,10 +81,11 @@ fsdp:
 fsdp_config:
 special_tokens:

-hub_model_id: neginashz/sft-qwen-25-7b-instruct
-hub_strategy:
-early_stopping_patience:
+hub_model_id: neginashz/sft-qwen-25-7b-instruct-2
+hub_strategy:
+early_stopping_patience:

+resume_from_checkpoint:
 auto_resume_from_checkpoints: true


@@ -93,11 +94,11 @@ auto_resume_from_checkpoints: true

 </details><br>

-# sft-qwen-25-7b-instruct
+# sft-qwen-25-7b-instruct-2

 This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the medalpaca/medical_meadow_medqa dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.
+- Loss: 0.1054

 ## Model description

@@ -116,7 +117,7 @@ More information needed
 ### Training hyperparameters

 The following hyperparameters were used during training:
-- learning_rate:
+- learning_rate: 5e-06
 - train_batch_size: 1
 - eval_batch_size: 1
 - seed: 42
@@ -126,8 +127,8 @@ The following hyperparameters were used during training:
 - total_eval_batch_size: 4
 - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
-- lr_scheduler_warmup_steps:
-- num_epochs:
+- lr_scheduler_warmup_steps: 4
+- num_epochs: 2

 ### Training results

@@ -141,6 +142,11 @@ The following hyperparameters were used during training:
 | 0.1068 | 0.7407 | 60 | 0.1101 |
 | 0.1061 | 0.8642 | 70 | 0.1056 |
 | 0.118 | 0.9877 | 80 | 0.1055 |
+| 0.0644 | 1.1111 | 90 | 0.1054 |
+| 0.0554 | 1.2346 | 100 | 0.1054 |
+| 0.0564 | 1.3580 | 110 | 0.1054 |
+| 0.0601 | 1.4815 | 120 | 0.1054 |
+| 0.0482 | 2.0 | 162 | 0.1054 |


 ### Framework versions
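The updated hyperparameters (learning_rate 5e-06, adamw_torch with betas=(0.9, 0.999) and epsilon=1e-08, a cosine schedule with 4 warmup steps, 2 epochs) correspond roughly to the plain PyTorch/transformers setup below. This is only an illustrative sketch, not the actual axolotl training loop: `model` is a stand-in module, and the 162 total optimizer steps are read off the last row of the training-results table.

```python
import torch
from transformers import get_cosine_schedule_with_warmup

# Stand-in for the fine-tuned model; the real run optimizes Qwen2.5-7B-Instruct.
model = torch.nn.Linear(8, 8)

# adamw_torch with the betas/epsilon reported in the card.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-6,               # learning_rate: 0.000005
    betas=(0.9, 0.999),
    eps=1e-8,
)

# Cosine decay with 4 warmup steps over the 162 steps of the 2-epoch run.
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=4, num_training_steps=162
)

for step in range(162):
    loss = model(torch.randn(1, 8)).pow(2).mean()  # stand-in for the LM loss
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
```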
Untitled1.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
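Because the config sets `hub_model_id: neginashz/sft-qwen-25-7b-instruct-2`, the finished checkpoint should be usable straight from the Hub. Below is a minimal usage sketch with the standard transformers API, assuming the weights were in fact pushed, accelerate is installed, and a GPU with enough memory is available; the example prompt is arbitrary.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "neginashz/sft-qwen-25-7b-instruct-2"  # hub_model_id from the config above

tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo, torch_dtype="auto", device_map="auto")

# The base model is Qwen2.5-7B-Instruct, so its chat template applies.
messages = [{"role": "user", "content": "List two common causes of iron-deficiency anemia."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True))
```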
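The dataset named in the front matter, medalpaca/medical_meadow_medqa, is a public Hub dataset, so the training data behind these numbers can be inspected with the datasets library. A quick sketch; the column names are whatever the dataset actually exposes, nothing in this card depends on them.

```python
from datasets import load_dataset

# medalpaca/medical_meadow_medqa is the dataset listed in the card's front matter.
ds = load_dataset("medalpaca/medical_meadow_medqa", split="train")

print(ds)     # number of rows and column names
print(ds[0])  # one raw example, to see the question/answer fields
```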