Minbyul committed on
Commit
9b4afcd
1 Parent(s): 6c8ffea

Model save

Browse files
README.md CHANGED
@@ -2,16 +2,11 @@
2
  license: apache-2.0
3
  base_model: mistralai/Mistral-7B-v0.1
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
- - trl
10
- - sft
11
- - alignment-handbook
12
- - generated_from_trainer
13
  datasets:
14
- - HuggingFaceH4/deita-10k-v0-sft
15
  model-index:
16
  - name: mistral-7b-wo-kqa_golden-iter-sft-step1
17
  results: []
@@ -22,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
22
 
23
  # mistral-7b-wo-kqa_golden-iter-sft-step1
24
 
25
- This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the HuggingFaceH4/deita-10k-v0-sft dataset.
26
  It achieves the following results on the evaluation set:
27
- - Loss: 1.4157
28
 
29
  ## Model description
30
 
@@ -61,14 +56,14 @@ The following hyperparameters were used during training:
61
 
62
  | Training Loss | Epoch | Step | Validation Loss |
63
  |:-------------:|:-----:|:----:|:---------------:|
64
- | 2.0983 | 0.97 | 16 | 1.2722 |
65
- | 1.4581 | 2.0 | 33 | 1.2942 |
66
- | 0.9338 | 2.91 | 48 | 1.4157 |
67
 
68
 
69
  ### Framework versions
70
 
71
- - Transformers 4.39.0.dev0
72
- - Pytorch 2.1.2
73
  - Datasets 2.14.6
74
  - Tokenizers 0.15.2
 
2
  license: apache-2.0
3
  base_model: mistralai/Mistral-7B-v0.1
4
  tags:
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
 
 
 
8
  datasets:
9
+ - generator
10
  model-index:
11
  - name: mistral-7b-wo-kqa_golden-iter-sft-step1
12
  results: []
 
17
 
18
  # mistral-7b-wo-kqa_golden-iter-sft-step1
19
 
20
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 1.1981
23
 
24
  ## Model description
25
 
 
56
 
57
  | Training Loss | Epoch | Step | Validation Loss |
58
  |:-------------:|:-----:|:----:|:---------------:|
59
+ | 1.0423 | 1.0 | 17 | 1.2217 |
60
+ | 0.8111 | 2.0 | 34 | 1.1844 |
61
+ | 0.6164 | 3.0 | 51 | 1.1981 |
62
 
63
 
64
  ### Framework versions
65
 
66
+ - Transformers 4.38.2
67
+ - Pytorch 2.1.2+cu121
68
  - Datasets 2.14.6
69
  - Tokenizers 0.15.2
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
- "epoch": 2.91,
3
- "eval_loss": 1.4157191514968872,
4
- "eval_runtime": 38.1251,
5
- "eval_samples": 4044,
6
- "eval_samples_per_second": 9.023,
7
- "eval_steps_per_second": 0.577,
8
- "train_loss": 1.5385780781507492,
9
- "train_runtime": 1102.2371,
10
  "train_samples": 4750,
11
- "train_samples_per_second": 2.874,
12
- "train_steps_per_second": 0.044
13
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.8299297000847611,
4
+ "train_runtime": 879.3584,
 
 
 
 
 
5
  "train_samples": 4750,
6
+ "train_samples_per_second": 3.708,
7
+ "train_steps_per_second": 0.058
8
  }
config.json CHANGED
@@ -20,7 +20,7 @@
20
  "sliding_window": 4096,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
- "transformers_version": "4.39.0.dev0",
24
- "use_cache": true,
25
  "vocab_size": 32000
26
  }
 
20
  "sliding_window": 4096,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.38.2",
24
+ "use_cache": false,
25
  "vocab_size": 32000
26
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
- "transformers_version": "4.39.0.dev0"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "transformers_version": "4.38.2"
6
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91dc65e5a3704f7948ab62ad8d5a8fd56f660165c50af2dd1f16b10e53515450
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ded5b3b2fde00ac15f5588ad5328b06f97d367873ee22c9d116180e942b5a63
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fce9c548ed97a033e6d6bc9efc641d4f12827c157b127198660b7b46064934b5
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a963c0c6921c2b91cb3431ec845680ee02954a9aa0b3ec49bef7fc67c6d5c99
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f19f5767b63b207703920bc9c6a4df49a04203694c0570cedec7eafd52da0ba2
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eeefd30ca5428baa750aa4b851be9ae801ce8084c3d598922ca164a52fa3d69
3
  size 4540516344
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.91,
3
- "train_loss": 1.5385780781507492,
4
- "train_runtime": 1102.2371,
5
  "train_samples": 4750,
6
- "train_samples_per_second": 2.874,
7
- "train_steps_per_second": 0.044
8
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.8299297000847611,
4
+ "train_runtime": 879.3584,
5
  "train_samples": 4750,
6
+ "train_samples_per_second": 3.708,
7
+ "train_steps_per_second": 0.058
8
  }
trainer_state.json CHANGED
@@ -1,123 +1,130 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.909090909090909,
5
  "eval_steps": 500,
6
- "global_step": 48,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06,
13
- "grad_norm": 7.70100371900725,
14
- "learning_rate": 4.000000000000001e-06,
15
- "loss": 2.2437,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.3,
20
- "grad_norm": 8.07737003057859,
21
- "learning_rate": 2e-05,
22
- "loss": 2.0985,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.61,
27
- "grad_norm": 6.4902480217917775,
28
- "learning_rate": 1.9340161087325483e-05,
29
- "loss": 2.132,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.91,
34
- "grad_norm": 3.741098959561992,
35
- "learning_rate": 1.744772182743782e-05,
36
- "loss": 2.0983,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.97,
41
- "eval_loss": 1.2721720933914185,
42
- "eval_runtime": 39.3037,
43
- "eval_samples_per_second": 8.752,
44
- "eval_steps_per_second": 0.56,
45
- "step": 16
46
  },
47
  {
48
- "epoch": 1.21,
49
- "grad_norm": 5.4461225981614865,
50
- "learning_rate": 1.4572423233046386e-05,
51
- "loss": 1.7742,
52
  "step": 20
53
  },
54
  {
55
- "epoch": 1.52,
56
- "grad_norm": 4.6058371258039505,
57
- "learning_rate": 1.1093712083778748e-05,
58
- "loss": 1.4965,
59
  "step": 25
60
  },
61
  {
62
- "epoch": 1.82,
63
- "grad_norm": 4.264046294630599,
64
- "learning_rate": 7.470666176083193e-06,
65
- "loss": 1.4581,
66
  "step": 30
67
  },
68
  {
69
  "epoch": 2.0,
70
- "eval_loss": 1.2942003011703491,
71
- "eval_runtime": 39.0225,
72
- "eval_samples_per_second": 8.815,
73
- "eval_steps_per_second": 0.564,
74
- "step": 33
75
  },
76
  {
77
- "epoch": 2.12,
78
- "grad_norm": 7.641748264447652,
79
- "learning_rate": 4.181410844420473e-06,
80
- "loss": 1.2737,
81
  "step": 35
82
  },
83
  {
84
- "epoch": 2.42,
85
- "grad_norm": 5.3186753462108225,
86
- "learning_rate": 1.660021821101222e-06,
87
- "loss": 0.979,
88
  "step": 40
89
  },
90
  {
91
- "epoch": 2.73,
92
- "grad_norm": 4.053250174296059,
93
- "learning_rate": 2.392412244407294e-07,
94
- "loss": 0.9338,
95
  "step": 45
96
  },
97
  {
98
- "epoch": 2.91,
99
- "eval_loss": 1.4157191514968872,
100
- "eval_runtime": 38.8008,
101
- "eval_samples_per_second": 8.866,
102
- "eval_steps_per_second": 0.567,
103
- "step": 48
104
  },
105
  {
106
- "epoch": 2.91,
107
- "step": 48,
108
- "total_flos": 9997878558720.0,
109
- "train_loss": 1.5385780781507492,
110
- "train_runtime": 1102.2371,
111
- "train_samples_per_second": 2.874,
112
- "train_steps_per_second": 0.044
 
 
 
 
 
 
 
 
113
  }
114
  ],
115
  "logging_steps": 5,
116
- "max_steps": 48,
117
  "num_input_tokens_seen": 0,
118
  "num_train_epochs": 3,
119
  "save_steps": 500,
120
- "total_flos": 9997878558720.0,
121
  "train_batch_size": 4,
122
  "trial_name": null,
123
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 51,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06,
13
+ "grad_norm": 7.433030029804606,
14
+ "learning_rate": 3.3333333333333333e-06,
15
+ "loss": 0.9486,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.29,
20
+ "grad_norm": 132.09106671695386,
21
+ "learning_rate": 1.6666666666666667e-05,
22
+ "loss": 1.1703,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.59,
27
+ "grad_norm": 9.251568558996158,
28
+ "learning_rate": 1.961261695938319e-05,
29
+ "loss": 1.0685,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.88,
34
+ "grad_norm": 4.328676411784069,
35
+ "learning_rate": 1.8090169943749477e-05,
36
+ "loss": 1.0423,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 1.0,
41
+ "eval_loss": 1.2217166423797607,
42
+ "eval_runtime": 4.1499,
43
+ "eval_samples_per_second": 10.121,
44
+ "eval_steps_per_second": 0.723,
45
+ "step": 17
46
  },
47
  {
48
+ "epoch": 1.18,
49
+ "grad_norm": 3.5643197557582407,
50
+ "learning_rate": 1.5591929034707468e-05,
51
+ "loss": 0.8385,
52
  "step": 20
53
  },
54
  {
55
+ "epoch": 1.47,
56
+ "grad_norm": 2.778059481053355,
57
+ "learning_rate": 1.2419218955996677e-05,
58
+ "loss": 0.7726,
59
  "step": 25
60
  },
61
  {
62
+ "epoch": 1.76,
63
+ "grad_norm": 2.439799240539835,
64
+ "learning_rate": 8.954715367323468e-06,
65
+ "loss": 0.8111,
66
  "step": 30
67
  },
68
  {
69
  "epoch": 2.0,
70
+ "eval_loss": 1.184422492980957,
71
+ "eval_runtime": 3.96,
72
+ "eval_samples_per_second": 10.606,
73
+ "eval_steps_per_second": 0.758,
74
+ "step": 34
75
  },
76
  {
77
+ "epoch": 2.06,
78
+ "grad_norm": 3.1823488644464084,
79
+ "learning_rate": 5.616288532109225e-06,
80
+ "loss": 0.7542,
81
  "step": 35
82
  },
83
  {
84
+ "epoch": 2.35,
85
+ "grad_norm": 2.5658011548139052,
86
+ "learning_rate": 2.8066019966134907e-06,
87
+ "loss": 0.6539,
88
  "step": 40
89
  },
90
  {
91
+ "epoch": 2.65,
92
+ "grad_norm": 2.532024165395188,
93
+ "learning_rate": 8.645454235739903e-07,
94
+ "loss": 0.6478,
95
  "step": 45
96
  },
97
  {
98
+ "epoch": 2.94,
99
+ "grad_norm": 2.036766400192092,
100
+ "learning_rate": 2.4359497401758026e-08,
101
+ "loss": 0.6164,
102
+ "step": 50
 
103
  },
104
  {
105
+ "epoch": 3.0,
106
+ "eval_loss": 1.198138952255249,
107
+ "eval_runtime": 3.9851,
108
+ "eval_samples_per_second": 10.539,
109
+ "eval_steps_per_second": 0.753,
110
+ "step": 51
111
+ },
112
+ {
113
+ "epoch": 3.0,
114
+ "step": 51,
115
+ "total_flos": 10626017525760.0,
116
+ "train_loss": 0.8299297000847611,
117
+ "train_runtime": 879.3584,
118
+ "train_samples_per_second": 3.708,
119
+ "train_steps_per_second": 0.058
120
  }
121
  ],
122
  "logging_steps": 5,
123
+ "max_steps": 51,
124
  "num_input_tokens_seen": 0,
125
  "num_train_epochs": 3,
126
  "save_steps": 500,
127
+ "total_flos": 10626017525760.0,
128
  "train_batch_size": 4,
129
  "trial_name": null,
130
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:111dec9fac45a0af07ef767962027eba86346e587bc51c10caee109e795fc697
3
  size 6200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0b79a7e8f05fd74caa616325c3db795998c57991eb9db1d1ed05f7cc4e97d34
3
  size 6200