DorinSht committed on
Commit
1854635
1 Parent(s): 86b8432

End of training

Browse files
README.md CHANGED
@@ -15,10 +15,10 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # ShareGPT_llama2_68M
17
 
18
- This model is a fine-tuned version of [JackFram/llama-68m](https://huggingface.co/JackFram/llama-68m) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 2.3636
21
- - Accuracy: 0.5808
22
 
23
  ## Model description
24
 
 
15
 
16
  # ShareGPT_llama2_68M
17
 
18
+ This model is a fine-tuned version of [JackFram/llama-68m](https://huggingface.co/JackFram/llama-68m) on the anon8231489123/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 2.3592
21
+ - Accuracy: 0.5813
22
 
23
  ## Model description
24
 
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.5813425267092882,
4
+ "eval_loss": 2.3592453002929688,
5
+ "eval_runtime": 73.2624,
6
+ "eval_samples": 1840,
7
+ "eval_samples_per_second": 25.115,
8
+ "eval_steps_per_second": 0.532,
9
+ "perplexity": 10.582961479869661,
10
+ "total_flos": 1.4536404559724544e+17,
11
+ "train_loss": 2.595605703293699,
12
+ "train_runtime": 11859.9653,
13
+ "train_samples": 90745,
14
+ "train_samples_per_second": 22.954,
15
+ "train_steps_per_second": 0.957
16
+ }
args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83655d6e7af9b50c2c73fe9e934f013ef33e3cec89e655b5e371774d2f562aa0
3
+ size 6036
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.5813425267092882,
4
+ "eval_loss": 2.3592453002929688,
5
+ "eval_runtime": 73.2624,
6
+ "eval_samples": 1840,
7
+ "eval_samples_per_second": 25.115,
8
+ "eval_steps_per_second": 0.532,
9
+ "perplexity": 10.582961479869661
10
+ }
events.out.tfevents.1717593890.isl-gpu3.8841.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c31baaf3d19b6c99f2acf76d73245ee167d7be011ca6e4ae492169276f31d551
3
+ size 411
log.txt CHANGED
@@ -1021,3 +1021,43 @@ Training completed. Do not forget to share your model on huggingface.co/models =
1021
 
1022
 
1023
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1024
  0%| | 0/39 [00:00<?, ?it/s]
1025
  5%|▌ | 2/39 [00:01<00:27, 1.35it/s]
1026
  8%|▊ | 3/39 [00:02<00:37, 1.06s/it]
1027
  10%|█ | 4/39 [00:04<00:42, 1.22s/it]
1028
  13%|█▎ | 5/39 [00:05<00:44, 1.31s/it]
1029
  15%|█▌ | 6/39 [00:07<00:45, 1.37s/it]
1030
  18%|█▊ | 7/39 [00:08<00:45, 1.41s/it]
1031
  21%|██ | 8/39 [00:10<00:44, 1.43s/it]
1032
  23%|██▎ | 9/39 [00:11<00:43, 1.45s/it]
1033
  26%|██▌ | 10/39 [00:13<00:42, 1.46s/it]
1034
  28%|██▊ | 11/39 [00:14<00:41, 1.47s/it]
1035
  31%|███ | 12/39 [00:16<00:39, 1.47s/it]
1036
  33%|███▎ | 13/39 [00:17<00:38, 1.48s/it]
1037
  36%|███▌ | 14/39 [00:19<00:37, 1.48s/it]
1038
  38%|███▊ | 15/39 [00:20<00:35, 1.48s/it]
1039
  41%|████ | 16/39 [00:22<00:34, 1.48s/it]
1040
  44%|████▎ | 17/39 [00:23<00:32, 1.49s/it]
1041
  46%|████▌ | 18/39 [00:25<00:31, 1.49s/it]
1042
  49%|████▊ | 19/39 [00:26<00:29, 1.49s/it]
1043
  51%|█████▏ | 20/39 [00:28<00:28, 1.49s/it]
1044
  54%|█████▍ | 21/39 [00:29<00:26, 1.49s/it]
1045
  56%|█████▋ | 22/39 [00:31<00:25, 1.49s/it]
1046
  59%|█████▉ | 23/39 [00:32<00:23, 1.49s/it]
1047
  62%|██████▏ | 24/39 [00:34<00:22, 1.49s/it]
1048
  64%|██████▍ | 25/39 [00:35<00:20, 1.49s/it]
1049
  67%|██████▋ | 26/39 [00:37<00:19, 1.49s/it]
1050
  69%|██████▉ | 27/39 [00:38<00:17, 1.49s/it]
1051
  72%|███████▏ | 28/39 [00:40<00:16, 1.49s/it]
1052
  74%|███████▍ | 29/39 [00:41<00:14, 1.49s/it]
1053
  77%|███████▋ | 30/39 [00:43<00:13, 1.49s/it]
1054
  79%|███████▉ | 31/39 [00:44<00:11, 1.49s/it]
1055
  82%|████████▏ | 32/39 [00:46<00:10, 1.49s/it]
1056
  85%|████████▍ | 33/39 [00:47<00:08, 1.49s/it]
1057
  87%|████████▋ | 34/39 [00:49<00:07, 1.49s/it]
1058
  90%|████████▉ | 35/39 [00:50<00:05, 1.49s/it]
1059
  92%|█████████▏| 36/39 [00:52<00:04, 1.49s/it]
1060
  95%|█████████▍| 37/39 [00:53<00:02, 1.49s/it]
1061
  97%|█████████▋| 38/39 [00:54<00:01, 1.46s/it]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1021
 
1022
 
1023
 
1024
+ ***** train metrics *****
1025
+ epoch = 3.0
1026
+ total_flos = 135380817GF
1027
+ train_loss = 2.5956
1028
+ train_runtime = 3:17:39.96
1029
+ train_samples = 90745
1030
+ train_samples_per_second = 22.954
1031
+ train_steps_per_second = 0.957
1032
+ 06/05/2024 06:23:37 - INFO - __main__ - *** Evaluate ***
1033
+ [INFO|trainer.py:3662] 2024-06-05 06:23:37,688 >> ***** Running Evaluation *****
1034
+ [INFO|trainer.py:3664] 2024-06-05 06:23:37,688 >> Num examples = 1840
1035
+ [INFO|trainer.py:3667] 2024-06-05 06:23:37,688 >> Batch size = 48
1036
+ /home/dshteyma/miniconda3/lib/python3.9/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
1037
+ warnings.warn('Was asked to gather along dimension 0, but all '
1038
+
1039
  0%| | 0/39 [00:00<?, ?it/s]
1040
  5%|▌ | 2/39 [00:01<00:27, 1.35it/s]
1041
  8%|▊ | 3/39 [00:02<00:37, 1.06s/it]
1042
  10%|█ | 4/39 [00:04<00:42, 1.22s/it]
1043
  13%|█▎ | 5/39 [00:05<00:44, 1.31s/it]
1044
  15%|█▌ | 6/39 [00:07<00:45, 1.37s/it]
1045
  18%|█▊ | 7/39 [00:08<00:45, 1.41s/it]
1046
  21%|██ | 8/39 [00:10<00:44, 1.43s/it]
1047
  23%|██▎ | 9/39 [00:11<00:43, 1.45s/it]
1048
  26%|██▌ | 10/39 [00:13<00:42, 1.46s/it]
1049
  28%|██▊ | 11/39 [00:14<00:41, 1.47s/it]
1050
  31%|███ | 12/39 [00:16<00:39, 1.47s/it]
1051
  33%|███▎ | 13/39 [00:17<00:38, 1.48s/it]
1052
  36%|███▌ | 14/39 [00:19<00:37, 1.48s/it]
1053
  38%|███▊ | 15/39 [00:20<00:35, 1.48s/it]
1054
  41%|████ | 16/39 [00:22<00:34, 1.48s/it]
1055
  44%|████▎ | 17/39 [00:23<00:32, 1.49s/it]
1056
  46%|████▌ | 18/39 [00:25<00:31, 1.49s/it]
1057
  49%|████▊ | 19/39 [00:26<00:29, 1.49s/it]
1058
  51%|█████▏ | 20/39 [00:28<00:28, 1.49s/it]
1059
  54%|█████▍ | 21/39 [00:29<00:26, 1.49s/it]
1060
  56%|█████▋ | 22/39 [00:31<00:25, 1.49s/it]
1061
  59%|█████▉ | 23/39 [00:32<00:23, 1.49s/it]
1062
  62%|██████▏ | 24/39 [00:34<00:22, 1.49s/it]
1063
  64%|██████▍ | 25/39 [00:35<00:20, 1.49s/it]
1064
  67%|██████▋ | 26/39 [00:37<00:19, 1.49s/it]
1065
  69%|██████▉ | 27/39 [00:38<00:17, 1.49s/it]
1066
  72%|███████▏ | 28/39 [00:40<00:16, 1.49s/it]
1067
  74%|███████▍ | 29/39 [00:41<00:14, 1.49s/it]
1068
  77%|███████▋ | 30/39 [00:43<00:13, 1.49s/it]
1069
  79%|███████▉ | 31/39 [00:44<00:11, 1.49s/it]
1070
  82%|████████▏ | 32/39 [00:46<00:10, 1.49s/it]
1071
  85%|████████▍ | 33/39 [00:47<00:08, 1.49s/it]
1072
  87%|████████▋ | 34/39 [00:49<00:07, 1.49s/it]
1073
  90%|████████▉ | 35/39 [00:50<00:05, 1.49s/it]
1074
  92%|█████████▏| 36/39 [00:52<00:04, 1.49s/it]
1075
  95%|█████████▍| 37/39 [00:53<00:02, 1.49s/it]
1076
  97%|█████████▋| 38/39 [00:54<00:01, 1.46s/it]
1077
+ [INFO|trainer.py:3353] 2024-06-05 06:24:50,968 >> Saving model checkpoint to ./training_outputs_job_117568_1_05-06_03-05
1078
+ [INFO|configuration_utils.py:471] 2024-06-05 06:24:50,983 >> Configuration saved in ./training_outputs_job_117568_1_05-06_03-05/config.json
1079
+ [INFO|configuration_utils.py:705] 2024-06-05 06:24:50,989 >> Configuration saved in ./training_outputs_job_117568_1_05-06_03-05/generation_config.json
1080
+ [INFO|modeling_utils.py:2592] 2024-06-05 06:24:51,930 >> Model weights saved in ./training_outputs_job_117568_1_05-06_03-05/model.safetensors
1081
+ [INFO|tokenization_utils_base.py:2503] 2024-06-05 06:24:51,943 >> tokenizer config file saved in ./training_outputs_job_117568_1_05-06_03-05/tokenizer_config.json
1082
+ [INFO|tokenization_utils_base.py:2512] 2024-06-05 06:24:51,948 >> Special tokens file saved in ./training_outputs_job_117568_1_05-06_03-05/special_tokens_map.json
1083
+ [INFO|modelcard.py:450] 2024-06-05 06:24:52,181 >> Dropping the following result as it does not have all the necessary fields:
1084
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.5813425267092882}]}
1085
+ ***** eval metrics *****
1086
+ epoch = 3.0
1087
+ eval_accuracy = 0.5813
1088
+ eval_loss = 2.3592
1089
+ eval_runtime = 0:01:13.26
1090
+ eval_samples = 1840
1091
+ eval_samples_per_second = 25.115
1092
+ eval_steps_per_second = 0.532
1093
+ perplexity = 10.583
1094
+
1095
+
1096
+
1097
+
1098
+
1099
+
1100
+
1101
+
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 1.4536404559724544e+17,
4
+ "train_loss": 2.595605703293699,
5
+ "train_runtime": 11859.9653,
6
+ "train_samples": 90745,
7
+ "train_samples_per_second": 22.954,
8
+ "train_steps_per_second": 0.957
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 1000,
6
+ "global_step": 11346,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.13220518244315177,
13
+ "grad_norm": 0.8546391725540161,
14
+ "learning_rate": 8.816009873931059e-05,
15
+ "loss": 5.1118,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.26441036488630354,
20
+ "grad_norm": 0.8593688607215881,
21
+ "learning_rate": 9.59831475011252e-05,
22
+ "loss": 3.406,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.26441036488630354,
27
+ "eval_accuracy": 0.5035306174465283,
28
+ "eval_loss": 3.23445987701416,
29
+ "eval_runtime": 74.0522,
30
+ "eval_samples_per_second": 24.847,
31
+ "eval_steps_per_second": 0.527,
32
+ "step": 1000
33
+ },
34
+ {
35
+ "epoch": 0.3966155473294553,
36
+ "grad_norm": 0.9617258906364441,
37
+ "learning_rate": 9.134314230431938e-05,
38
+ "loss": 3.0005,
39
+ "step": 1500
40
+ },
41
+ {
42
+ "epoch": 0.5288207297726071,
43
+ "grad_norm": 0.8953185677528381,
44
+ "learning_rate": 8.670313710751356e-05,
45
+ "loss": 2.8119,
46
+ "step": 2000
47
+ },
48
+ {
49
+ "epoch": 0.5288207297726071,
50
+ "eval_accuracy": 0.5365118094348038,
51
+ "eval_loss": 2.821384906768799,
52
+ "eval_runtime": 72.8065,
53
+ "eval_samples_per_second": 25.272,
54
+ "eval_steps_per_second": 0.536,
55
+ "step": 2000
56
+ },
57
+ {
58
+ "epoch": 0.6610259122157589,
59
+ "grad_norm": 1.4154396057128906,
60
+ "learning_rate": 8.206313191070773e-05,
61
+ "loss": 2.686,
62
+ "step": 2500
63
+ },
64
+ {
65
+ "epoch": 0.7932310946589106,
66
+ "grad_norm": 1.821349024772644,
67
+ "learning_rate": 7.742312671390191e-05,
68
+ "loss": 2.607,
69
+ "step": 3000
70
+ },
71
+ {
72
+ "epoch": 0.7932310946589106,
73
+ "eval_accuracy": 0.5497897240925214,
74
+ "eval_loss": 2.657219886779785,
75
+ "eval_runtime": 72.9553,
76
+ "eval_samples_per_second": 25.221,
77
+ "eval_steps_per_second": 0.535,
78
+ "step": 3000
79
+ },
80
+ {
81
+ "epoch": 0.9254362771020624,
82
+ "grad_norm": 2.0297396183013916,
83
+ "learning_rate": 7.278312151709609e-05,
84
+ "loss": 2.5642,
85
+ "step": 3500
86
+ },
87
+ {
88
+ "epoch": 1.0576414595452142,
89
+ "grad_norm": 2.8318285942077637,
90
+ "learning_rate": 6.814311632029027e-05,
91
+ "loss": 2.4734,
92
+ "step": 4000
93
+ },
94
+ {
95
+ "epoch": 1.0576414595452142,
96
+ "eval_accuracy": 0.5582058048894458,
97
+ "eval_loss": 2.5735702514648438,
98
+ "eval_runtime": 73.5253,
99
+ "eval_samples_per_second": 25.025,
100
+ "eval_steps_per_second": 0.53,
101
+ "step": 4000
102
+ },
103
+ {
104
+ "epoch": 1.189846641988366,
105
+ "grad_norm": 2.178349018096924,
106
+ "learning_rate": 6.350311112348446e-05,
107
+ "loss": 2.4542,
108
+ "step": 4500
109
+ },
110
+ {
111
+ "epoch": 1.3220518244315178,
112
+ "grad_norm": 2.944026470184326,
113
+ "learning_rate": 5.886310592667864e-05,
114
+ "loss": 2.4335,
115
+ "step": 5000
116
+ },
117
+ {
118
+ "epoch": 1.3220518244315178,
119
+ "eval_accuracy": 0.5617462458316518,
120
+ "eval_loss": 2.5335800647735596,
121
+ "eval_runtime": 73.7726,
122
+ "eval_samples_per_second": 24.941,
123
+ "eval_steps_per_second": 0.529,
124
+ "step": 5000
125
+ },
126
+ {
127
+ "epoch": 1.4542570068746694,
128
+ "grad_norm": 3.0737102031707764,
129
+ "learning_rate": 5.422310072987282e-05,
130
+ "loss": 2.4012,
131
+ "step": 5500
132
+ },
133
+ {
134
+ "epoch": 1.5864621893178212,
135
+ "grad_norm": 3.8758912086486816,
136
+ "learning_rate": 4.9583095533066995e-05,
137
+ "loss": 2.3858,
138
+ "step": 6000
139
+ },
140
+ {
141
+ "epoch": 1.5864621893178212,
142
+ "eval_accuracy": 0.5662889488328625,
143
+ "eval_loss": 2.4896483421325684,
144
+ "eval_runtime": 73.8595,
145
+ "eval_samples_per_second": 24.912,
146
+ "eval_steps_per_second": 0.528,
147
+ "step": 6000
148
+ },
149
+ {
150
+ "epoch": 1.718667371760973,
151
+ "grad_norm": 7.714890956878662,
152
+ "learning_rate": 4.4943090336261176e-05,
153
+ "loss": 2.3797,
154
+ "step": 6500
155
+ },
156
+ {
157
+ "epoch": 1.8508725542041247,
158
+ "grad_norm": 7.321514129638672,
159
+ "learning_rate": 4.030308513945535e-05,
160
+ "loss": 2.374,
161
+ "step": 7000
162
+ },
163
+ {
164
+ "epoch": 1.8508725542041247,
165
+ "eval_accuracy": 0.5680874450415242,
166
+ "eval_loss": 2.4711225032806396,
167
+ "eval_runtime": 73.2926,
168
+ "eval_samples_per_second": 25.105,
169
+ "eval_steps_per_second": 0.532,
170
+ "step": 7000
171
+ },
172
+ {
173
+ "epoch": 1.9830777366472767,
174
+ "grad_norm": 5.710904598236084,
175
+ "learning_rate": 3.566307994264953e-05,
176
+ "loss": 2.3445,
177
+ "step": 7500
178
+ },
179
+ {
180
+ "epoch": 2.1152829190904283,
181
+ "grad_norm": 16.555458068847656,
182
+ "learning_rate": 3.1023074745843715e-05,
183
+ "loss": 2.3044,
184
+ "step": 8000
185
+ },
186
+ {
187
+ "epoch": 2.1152829190904283,
188
+ "eval_accuracy": 0.5701761326225017,
189
+ "eval_loss": 2.4485366344451904,
190
+ "eval_runtime": 73.4687,
191
+ "eval_samples_per_second": 25.045,
192
+ "eval_steps_per_second": 0.531,
193
+ "step": 8000
194
+ },
195
+ {
196
+ "epoch": 2.24748810153358,
197
+ "grad_norm": 6.3735737800598145,
198
+ "learning_rate": 2.6383069549037897e-05,
199
+ "loss": 2.2922,
200
+ "step": 8500
201
+ },
202
+ {
203
+ "epoch": 2.379693283976732,
204
+ "grad_norm": 19.310932159423828,
205
+ "learning_rate": 2.1743064352232075e-05,
206
+ "loss": 2.2873,
207
+ "step": 9000
208
+ },
209
+ {
210
+ "epoch": 2.379693283976732,
211
+ "eval_accuracy": 0.5747236146216096,
212
+ "eval_loss": 2.413153886795044,
213
+ "eval_runtime": 73.9,
214
+ "eval_samples_per_second": 24.899,
215
+ "eval_steps_per_second": 0.528,
216
+ "step": 9000
217
+ },
218
+ {
219
+ "epoch": 2.5118984664198836,
220
+ "grad_norm": 6.743342876434326,
221
+ "learning_rate": 1.7103059155426253e-05,
222
+ "loss": 2.2626,
223
+ "step": 9500
224
+ },
225
+ {
226
+ "epoch": 2.6441036488630356,
227
+ "grad_norm": 9.461324691772461,
228
+ "learning_rate": 1.2463053958620433e-05,
229
+ "loss": 2.2669,
230
+ "step": 10000
231
+ },
232
+ {
233
+ "epoch": 2.6441036488630356,
234
+ "eval_accuracy": 0.5778246532571526,
235
+ "eval_loss": 2.387904167175293,
236
+ "eval_runtime": 73.5959,
237
+ "eval_samples_per_second": 25.001,
238
+ "eval_steps_per_second": 0.53,
239
+ "step": 10000
240
+ },
241
+ {
242
+ "epoch": 2.776308831306187,
243
+ "grad_norm": 10.6486234664917,
244
+ "learning_rate": 7.823048761814613e-06,
245
+ "loss": 2.2557,
246
+ "step": 10500
247
+ },
248
+ {
249
+ "epoch": 2.908514013749339,
250
+ "grad_norm": 8.470663070678711,
251
+ "learning_rate": 3.183043565008793e-06,
252
+ "loss": 2.2463,
253
+ "step": 11000
254
+ },
255
+ {
256
+ "epoch": 2.908514013749339,
257
+ "eval_accuracy": 0.5807555064675771,
258
+ "eval_loss": 2.363588571548462,
259
+ "eval_runtime": 73.5703,
260
+ "eval_samples_per_second": 25.01,
261
+ "eval_steps_per_second": 0.53,
262
+ "step": 11000
263
+ },
264
+ {
265
+ "epoch": 3.0,
266
+ "step": 11346,
267
+ "total_flos": 1.4536404559724544e+17,
268
+ "train_loss": 2.595605703293699,
269
+ "train_runtime": 11859.9653,
270
+ "train_samples_per_second": 22.954,
271
+ "train_steps_per_second": 0.957
272
+ }
273
+ ],
274
+ "logging_steps": 500,
275
+ "max_steps": 11346,
276
+ "num_input_tokens_seen": 0,
277
+ "num_train_epochs": 3,
278
+ "save_steps": 1000,
279
+ "stateful_callbacks": {
280
+ "TrainerControl": {
281
+ "args": {
282
+ "should_epoch_stop": false,
283
+ "should_evaluate": false,
284
+ "should_log": false,
285
+ "should_save": true,
286
+ "should_training_stop": false
287
+ },
288
+ "attributes": {}
289
+ }
290
+ },
291
+ "total_flos": 1.4536404559724544e+17,
292
+ "train_batch_size": 24,
293
+ "trial_name": null,
294
+ "trial_params": null
295
+ }