chansung committed on
Commit
abdc5bb
1 Parent(s): 989213a

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,13 @@
2
  license: gemma
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
 
8
  - generated_from_trainer
9
  base_model: google/gemma-2b
10
  datasets:
11
- - llama-duo/synth_summarize_dataset_dedup
12
  model-index:
13
  - name: gemma2b-summarize-gemini1_5flash-64k
14
  results: []
@@ -19,9 +19,9 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # gemma2b-summarize-gemini1_5flash-64k
21
 
22
- This model is a fine-tuned version of [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the llama-duo/synth_summarize_dataset_dedup dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 2.7185
25
 
26
  ## Model description
27
 
@@ -52,7 +52,7 @@ The following hyperparameters were used during training:
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
55
- - num_epochs: 15
56
 
57
  ### Training results
58
 
@@ -71,8 +71,6 @@ The following hyperparameters were used during training:
71
  | 0.9208 | 10.9905 | 577 | 2.7079 |
72
  | 0.9195 | 12.0 | 630 | 2.7148 |
73
  | 0.9212 | 12.9905 | 682 | 2.7154 |
74
- | 0.9136 | 14.0 | 735 | 2.7181 |
75
- | 0.9103 | 14.8571 | 780 | 2.7185 |
76
 
77
 
78
  ### Framework versions
 
2
  license: gemma
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - sft
7
+ - alignment-handbook
8
  - generated_from_trainer
9
  base_model: google/gemma-2b
10
  datasets:
11
+ - generator
12
  model-index:
13
  - name: gemma2b-summarize-gemini1_5flash-64k
14
  results: []
 
19
 
20
  # gemma2b-summarize-gemini1_5flash-64k
21
 
22
+ This model is a fine-tuned version of [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the generator dataset.
23
  It achieves the following results on the evaluation set:
24
+ - Loss: 2.7154
25
 
26
  ## Model description
27
 
 
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
55
+ - num_epochs: 10
56
 
57
  ### Training results
58
 
 
71
  | 0.9208 | 10.9905 | 577 | 2.7079 |
72
  | 0.9195 | 12.0 | 630 | 2.7148 |
73
  | 0.9212 | 12.9905 | 682 | 2.7154 |
 
 
74
 
75
 
76
  ### Framework versions
adapter_config.json CHANGED
@@ -21,11 +21,11 @@
21
  "revision": null,
22
  "target_modules": [
23
  "k_proj",
24
- "down_proj",
25
  "q_proj",
26
- "o_proj",
27
- "v_proj",
28
  "gate_proj",
 
 
 
29
  "up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
 
21
  "revision": null,
22
  "target_modules": [
23
  "k_proj",
 
24
  "q_proj",
 
 
25
  "gate_proj",
26
+ "down_proj",
27
+ "v_proj",
28
+ "o_proj",
29
  "up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ae6086e7cdaa1c7742eb4042577d161b6afc3838df568f2b317918f4e82a95d
3
  size 39256960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c2747846863e52a0aaf9d156c39e150d13213de860c580c3e4dd71702df720c
3
  size 39256960
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 14.857142857142858,
3
  "eval_loss": 2.71852707862854,
4
  "eval_runtime": 0.4953,
5
  "eval_samples": 25,
6
  "eval_samples_per_second": 42.399,
7
  "eval_steps_per_second": 2.019,
8
- "total_flos": 1.2277516310308454e+18,
9
- "train_loss": 1.077159938445458,
10
- "train_runtime": 4175.1629,
11
  "train_samples": 63353,
12
- "train_samples_per_second": 47.883,
13
- "train_steps_per_second": 0.187
14
  }
 
1
  {
2
+ "epoch": 13.333333333333334,
3
  "eval_loss": 2.71852707862854,
4
  "eval_runtime": 0.4953,
5
  "eval_samples": 25,
6
  "eval_samples_per_second": 42.399,
7
  "eval_steps_per_second": 2.019,
8
+ "total_flos": 1.1018283868225536e+18,
9
+ "train_loss": 0.0,
10
+ "train_runtime": 3.5395,
11
  "train_samples": 63353,
12
+ "train_samples_per_second": 37654.826,
13
+ "train_steps_per_second": 146.913
14
  }
runs/Jun10_04-56-56_48ddfe8e991f/events.out.tfevents.1717995436.48ddfe8e991f.131950.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e19825ff88c6f61e8a91bd83bb45388c64edd43954ece8b9a928d99eeafb6a2
3
+ size 5959
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 14.857142857142858,
3
- "total_flos": 1.2277516310308454e+18,
4
- "train_loss": 1.077159938445458,
5
- "train_runtime": 4175.1629,
6
  "train_samples": 63353,
7
- "train_samples_per_second": 47.883,
8
- "train_steps_per_second": 0.187
9
  }
 
1
  {
2
+ "epoch": 13.333333333333334,
3
+ "total_flos": 1.1018283868225536e+18,
4
+ "train_loss": 0.0,
5
+ "train_runtime": 3.5395,
6
  "train_samples": 63353,
7
+ "train_samples_per_second": 37654.826,
8
+ "train_steps_per_second": 146.913
9
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 14.857142857142858,
5
  "eval_steps": 500,
6
- "global_step": 780,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1100,147 +1100,19 @@
1100
  "step": 700
1101
  },
1102
  {
1103
- "epoch": 13.428571428571429,
1104
- "grad_norm": 0.23828125,
1105
- "learning_rate": 5.580037533961546e-06,
1106
- "loss": 0.9212,
1107
- "step": 705
1108
- },
1109
- {
1110
- "epoch": 13.523809523809524,
1111
- "grad_norm": 0.2373046875,
1112
- "learning_rate": 4.866728191731829e-06,
1113
- "loss": 0.909,
1114
- "step": 710
1115
- },
1116
- {
1117
- "epoch": 13.619047619047619,
1118
- "grad_norm": 0.2431640625,
1119
- "learning_rate": 4.20104876845111e-06,
1120
- "loss": 0.9146,
1121
- "step": 715
1122
- },
1123
- {
1124
- "epoch": 13.714285714285714,
1125
- "grad_norm": 0.2431640625,
1126
- "learning_rate": 3.5833325466437694e-06,
1127
- "loss": 0.9107,
1128
- "step": 720
1129
- },
1130
- {
1131
- "epoch": 13.80952380952381,
1132
- "grad_norm": 0.2451171875,
1133
- "learning_rate": 3.013888795328057e-06,
1134
- "loss": 0.9136,
1135
- "step": 725
1136
- },
1137
- {
1138
- "epoch": 13.904761904761905,
1139
- "grad_norm": 0.236328125,
1140
- "learning_rate": 2.4930026151759766e-06,
1141
- "loss": 0.9147,
1142
- "step": 730
1143
- },
1144
- {
1145
- "epoch": 14.0,
1146
- "grad_norm": 0.2470703125,
1147
- "learning_rate": 2.0209347957732328e-06,
1148
- "loss": 0.9136,
1149
- "step": 735
1150
- },
1151
- {
1152
- "epoch": 14.0,
1153
- "eval_loss": 2.7180798053741455,
1154
- "eval_runtime": 0.4838,
1155
- "eval_samples_per_second": 43.41,
1156
- "eval_steps_per_second": 2.067,
1157
- "step": 735
1158
- },
1159
- {
1160
- "epoch": 14.095238095238095,
1161
- "grad_norm": 0.23828125,
1162
- "learning_rate": 1.5979216850509848e-06,
1163
- "loss": 0.9092,
1164
- "step": 740
1165
- },
1166
- {
1167
- "epoch": 14.19047619047619,
1168
- "grad_norm": 0.2578125,
1169
- "learning_rate": 1.2241750709546917e-06,
1170
- "loss": 0.9159,
1171
- "step": 745
1172
- },
1173
- {
1174
- "epoch": 14.285714285714286,
1175
- "grad_norm": 0.236328125,
1176
- "learning_rate": 8.998820754091531e-07,
1177
- "loss": 0.9153,
1178
- "step": 750
1179
- },
1180
- {
1181
- "epoch": 14.380952380952381,
1182
- "grad_norm": 0.244140625,
1183
- "learning_rate": 6.25205060633205e-07,
1184
- "loss": 0.9114,
1185
- "step": 755
1186
- },
1187
- {
1188
- "epoch": 14.476190476190476,
1189
- "grad_norm": 0.244140625,
1190
- "learning_rate": 4.0028154785050063e-07,
1191
- "loss": 0.9163,
1192
- "step": 760
1193
- },
1194
- {
1195
- "epoch": 14.571428571428571,
1196
- "grad_norm": 0.236328125,
1197
- "learning_rate": 2.2522414843748618e-07,
1198
- "loss": 0.9153,
1199
- "step": 765
1200
- },
1201
- {
1202
- "epoch": 14.666666666666666,
1203
- "grad_norm": 0.23828125,
1204
- "learning_rate": 1.0012050754277802e-07,
1205
- "loss": 0.912,
1206
- "step": 770
1207
- },
1208
- {
1209
- "epoch": 14.761904761904763,
1210
- "grad_norm": 0.25,
1211
- "learning_rate": 2.5033260206275277e-08,
1212
- "loss": 0.9155,
1213
- "step": 775
1214
- },
1215
- {
1216
- "epoch": 14.857142857142858,
1217
- "grad_norm": 0.240234375,
1218
- "learning_rate": 0.0,
1219
- "loss": 0.9103,
1220
- "step": 780
1221
- },
1222
- {
1223
- "epoch": 14.857142857142858,
1224
- "eval_loss": 2.71852707862854,
1225
- "eval_runtime": 0.4854,
1226
- "eval_samples_per_second": 43.26,
1227
- "eval_steps_per_second": 2.06,
1228
- "step": 780
1229
- },
1230
- {
1231
- "epoch": 14.857142857142858,
1232
- "step": 780,
1233
- "total_flos": 1.2277516310308454e+18,
1234
- "train_loss": 1.077159938445458,
1235
- "train_runtime": 4175.1629,
1236
- "train_samples_per_second": 47.883,
1237
- "train_steps_per_second": 0.187
1238
  }
1239
  ],
1240
  "logging_steps": 5,
1241
- "max_steps": 780,
1242
  "num_input_tokens_seen": 0,
1243
- "num_train_epochs": 15,
1244
  "save_steps": 100,
1245
  "stateful_callbacks": {
1246
  "TrainerControl": {
@@ -1254,7 +1126,7 @@
1254
  "attributes": {}
1255
  }
1256
  },
1257
- "total_flos": 1.2277516310308454e+18,
1258
  "train_batch_size": 16,
1259
  "trial_name": null,
1260
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 13.333333333333334,
5
  "eval_steps": 500,
6
+ "global_step": 700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1100
  "step": 700
1101
  },
1102
  {
1103
+ "epoch": 13.333333333333334,
1104
+ "step": 700,
1105
+ "total_flos": 1.1018283868225536e+18,
1106
+ "train_loss": 0.0,
1107
+ "train_runtime": 3.5395,
1108
+ "train_samples_per_second": 37654.826,
1109
+ "train_steps_per_second": 146.913
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1110
  }
1111
  ],
1112
  "logging_steps": 5,
1113
+ "max_steps": 520,
1114
  "num_input_tokens_seen": 0,
1115
+ "num_train_epochs": 10,
1116
  "save_steps": 100,
1117
  "stateful_callbacks": {
1118
  "TrainerControl": {
 
1126
  "attributes": {}
1127
  }
1128
  },
1129
+ "total_flos": 1.1018283868225536e+18,
1130
  "train_batch_size": 16,
1131
  "trial_name": null,
1132
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1ffb27f0fd6a7143dd5c1775871c7ef4697a68201a714b4e32514ace64902a5
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:864c40eca7ce2f5228b456390efab4e31903bcf7dac802808ea32a49d0bbfa72
3
  size 5304