Kat380 committed
Commit f9693c8 · verified · 1 parent: a6aa0fe

update model checkpoints
README.md CHANGED
@@ -1,14 +1,11 @@
 ---
-license: gemma
 library_name: peft
 tags:
 - alignment-handbook
-- trl
-- sft
 - generated_from_trainer
-base_model: google/gemma-7b
 datasets:
 - llama-duo/synth_summarize_dataset_dedup
+base_model: google/gemma-7b
 model-index:
 - name: gemma7b-summarize-gpt4o-128k
   results: []
@@ -21,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset_dedup dataset.
 It achieves the following results on the evaluation set:
-- Loss: 4.8761
+- Loss: 2.4869
 
 ## Model description
 
@@ -42,43 +39,38 @@ More information needed
 The following hyperparameters were used during training:
 - learning_rate: 0.0002
 - train_batch_size: 4
-- eval_batch_size: 4
+- eval_batch_size: 2
 - seed: 42
 - distributed_type: multi-GPU
-- num_devices: 2
+- num_devices: 8
 - gradient_accumulation_steps: 2
-- total_train_batch_size: 16
-- total_eval_batch_size: 8
+- total_train_batch_size: 64
+- total_eval_batch_size: 16
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
-- num_epochs: 15
+- num_epochs: 10
 
 ### Training results
 
-| Training Loss | Epoch | Step  | Validation Loss |
-|:-------------:|:-----:|:-----:|:---------------:|
-| 0.8829        | 1.0   | 815   | 2.3801          |
-| 0.8134        | 2.0   | 1630  | 2.3417          |
-| 0.7643        | 3.0   | 2445  | 2.4263          |
-| 0.7211        | 4.0   | 3260  | 2.5582          |
-| 0.6713        | 5.0   | 4075  | 2.7395          |
-| 0.629         | 6.0   | 4890  | 3.0156          |
-| 0.5603        | 7.0   | 5705  | 3.2838          |
-| 0.5081        | 8.0   | 6520  | 3.5777          |
-| 0.4652        | 9.0   | 7335  | 4.0181          |
-| 0.434         | 10.0  | 8150  | 4.3232          |
-| 0.3977        | 11.0  | 8965  | 4.6346          |
-| 0.3878        | 12.0  | 9780  | 4.7574          |
-| 0.3766        | 13.0  | 10595 | 4.8475          |
-| 0.3665        | 14.0  | 11410 | 4.8724          |
-| 0.3658        | 15.0  | 12225 | 4.8761          |
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.1594        | 0.9977 | 219  | 2.6195          |
+| 1.0276        | 2.0    | 439  | 2.4670          |
+| 0.9492        | 2.9977 | 658  | 2.4451          |
+| 0.8751        | 4.0    | 878  | 2.4359          |
+| 0.8477        | 4.9977 | 1097 | 2.4390          |
+| 0.809         | 6.0    | 1317 | 2.4546          |
+| 0.7918        | 6.9977 | 1536 | 2.4592          |
+| 0.7847        | 8.0    | 1756 | 2.4783          |
+| 0.7808        | 8.9977 | 1975 | 2.4889          |
+| 0.7794        | 9.9772 | 2190 | 2.4869          |
 
 
 ### Framework versions
 
 - PEFT 0.10.0
 - Transformers 4.40.0
-- Pytorch 2.2.1+cu121
+- Pytorch 2.1.2+cu121
 - Datasets 2.18.0
 - Tokenizers 0.19.1
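The updated effective batch sizes in the card follow directly from the per-device settings; a minimal sanity check of that arithmetic, using only values from the diff above:

```python
# Effective batch sizes implied by the updated hyperparameters above.
train_batch_size = 4              # per device
eval_batch_size = 2               # per device
num_devices = 8
gradient_accumulation_steps = 2

total_train_batch_size = train_batch_size * num_devices * gradient_accumulation_steps
total_eval_batch_size = eval_batch_size * num_devices

assert total_train_batch_size == 64  # matches total_train_batch_size in the card
assert total_eval_batch_size == 16   # matches total_eval_batch_size in the card
```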
adapter_config.json CHANGED
@@ -10,23 +10,18 @@
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
-  "lora_alpha": 16,
+  "lora_alpha": 64,
   "lora_dropout": 0.05,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
-  "r": 8,
+  "r": 32,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q_proj",
-    "down_proj",
-    "k_proj",
     "v_proj",
-    "up_proj",
-    "o_proj",
-    "gate_proj"
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6fd0002e4068a7964c6a73eaeea266c5304d62773841d94dd6bd2a4402d9ffa4
-size 50056096
+oid sha256:598b26d0f7b973fe163ec750abe369f3df5104ffe548f7bd9a95080fa76ea846
+size 25705248
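The adapter_config.json change above narrows the LoRA target modules to q_proj/v_proj while raising the rank, which is consistent with the roughly halved adapter_model.safetensors size (50,056,096 → 25,705,248 bytes). A minimal `peft` sketch of the updated settings; all other arguments are left at library defaults, which is an assumption:

```python
from peft import LoraConfig

# LoRA settings as recorded in the updated adapter_config.json;
# unspecified arguments fall back to peft defaults.
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
```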
all_results.json CHANGED
@@ -1,14 +1,14 @@
 {
-    "epoch": 15.0,
-    "eval_loss": 4.876120567321777,
-    "eval_runtime": 2.0583,
+    "epoch": 9.977220956719817,
+    "eval_loss": 2.486895799636841,
+    "eval_runtime": 0.2338,
     "eval_samples": 25,
-    "eval_samples_per_second": 4.858,
-    "eval_steps_per_second": 0.972,
-    "total_flos": 9.345237271407755e+18,
-    "train_loss": 0.8459899323898347,
-    "train_runtime": 79658.5706,
-    "train_samples": 119991,
-    "train_samples_per_second": 2.455,
-    "train_steps_per_second": 0.153
+    "eval_samples_per_second": 42.766,
+    "eval_steps_per_second": 4.277,
+    "total_flos": 6.685995643959247e+18,
+    "train_loss": 1.6744230088577967,
+    "train_runtime": 5295.0828,
+    "train_samples": 129221,
+    "train_samples_per_second": 26.519,
+    "train_steps_per_second": 0.414
 }
config.json CHANGED
@@ -23,9 +23,9 @@
     "_load_in_4bit": true,
     "_load_in_8bit": false,
     "bnb_4bit_compute_dtype": "bfloat16",
-    "bnb_4bit_quant_storage": "bfloat16",
+    "bnb_4bit_quant_storage": "uint8",
     "bnb_4bit_quant_type": "nf4",
-    "bnb_4bit_use_double_quant": true,
+    "bnb_4bit_use_double_quant": false,
     "llm_int8_enable_fp32_cpu_offload": false,
     "llm_int8_has_fp16_weight": false,
     "llm_int8_skip_modules": null,
eval_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 15.0,
-    "eval_loss": 4.876120567321777,
-    "eval_runtime": 2.0583,
+    "epoch": 9.977220956719817,
+    "eval_loss": 2.486895799636841,
+    "eval_runtime": 0.2338,
     "eval_samples": 25,
-    "eval_samples_per_second": 4.858,
-    "eval_steps_per_second": 0.972
+    "eval_samples_per_second": 42.766,
+    "eval_steps_per_second": 4.277
 }
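For reference, the quantization settings recorded in the updated config.json above (nf4 4-bit, bfloat16 compute, double quantization off, uint8 storage) map to roughly the following `transformers` configuration. This is a sketch reconstructed from the recorded values, not taken from the training script itself:

```python
import torch
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization matching the values recorded in config.json.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_storage="uint8",
)
```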
runs/Jun13_07-22-13_gpu1-2/events.out.tfevents.1718234758.gpu1-2.1174782.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b0ba027b8c29cbdae1013ae054b30ce0d300e784aa34d712237e3b8668922e1
+size 101044
runs/Jun13_07-22-13_gpu1-2/events.out.tfevents.1718240054.gpu1-2.1174782.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:468c336c14c5c8dbb22dde0be8ea36b7266b9fdcc5ab175910c28532116ba6f6
+size 359
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 15.0,
-    "total_flos": 9.345237271407755e+18,
-    "train_loss": 0.8459899323898347,
-    "train_runtime": 79658.5706,
-    "train_samples": 119991,
-    "train_samples_per_second": 2.455,
-    "train_steps_per_second": 0.153
+    "epoch": 9.977220956719817,
+    "total_flos": 6.685995643959247e+18,
+    "train_loss": 1.6744230088577967,
+    "train_runtime": 5295.0828,
+    "train_samples": 129221,
+    "train_samples_per_second": 26.519,
+    "train_steps_per_second": 0.414
 }
trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a2e6f6a085ea159a9d3d15a7146c7bc6a09e26a3a7db99b2b5fc938ca1950a4c
+oid sha256:a7dddc219ed688da0dbc3783b6e56bb5749c3e2c0e72039146a8794d8a56351c
 size 5176
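Putting the pieces together, a minimal sketch for loading this checkpoint for inference. The adapter repo id below is a placeholder (the exact id is not part of this diff), and the quantization settings mirror the values recorded in config.json:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

ADAPTER_ID = "<user>/gemma7b-summarize-gpt4o-128k"  # placeholder repo id

# Quantize the base model the same way it was quantized for training.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
base = AutoModelForCausalLM.from_pretrained(
    "google/gemma-7b",
    quantization_config=bnb,
    device_map="auto",
)
model = PeftModel.from_pretrained(base, ADAPTER_ID)  # attach the LoRA adapter
model.eval()
```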