Mediocreatmybest committed on
Commit
9409bed
1 Parent(s): 6449a91

Upload InstructBlipForConditionalGeneration

Browse files
config.json CHANGED
@@ -1,5 +1,4 @@
1
  {
2
- "_commit_hash": "e35db05f9ad1647926e2f09bf91cda9a2cf14d0c",
3
  "_name_or_path": "Salesforce/instructblip-vicuna-13b",
4
  "architectures": [
5
  "InstructBlipForConditionalGeneration"
@@ -9,81 +8,7 @@
9
  "model_type": "instructblip",
10
  "num_query_tokens": 32,
11
  "qformer_config": {
12
- "_name_or_path": "",
13
- "add_cross_attention": false,
14
- "architectures": null,
15
- "attention_probs_dropout_prob": 0.1,
16
- "bad_words_ids": null,
17
- "begin_suppress_tokens": null,
18
- "bos_token_id": null,
19
- "chunk_size_feed_forward": 0,
20
- "cross_attention_frequency": 2,
21
- "cross_attention_hidden_size": null,
22
- "decoder_start_token_id": null,
23
- "diversity_penalty": 0.0,
24
- "do_sample": false,
25
- "early_stopping": false,
26
- "encoder_hidden_size": 1408,
27
- "encoder_no_repeat_ngram_size": 0,
28
- "eos_token_id": null,
29
- "exponential_decay_length_penalty": null,
30
- "finetuning_task": null,
31
- "forced_bos_token_id": null,
32
- "forced_eos_token_id": null,
33
- "hidden_act": "gelu",
34
- "hidden_dropout_prob": 0.1,
35
- "hidden_size": 768,
36
- "id2label": {
37
- "0": "LABEL_0",
38
- "1": "LABEL_1"
39
- },
40
- "initializer_range": 0.02,
41
- "intermediate_size": 3072,
42
- "is_decoder": false,
43
- "is_encoder_decoder": false,
44
- "label2id": {
45
- "LABEL_0": 0,
46
- "LABEL_1": 1
47
- },
48
- "layer_norm_eps": 1e-12,
49
- "length_penalty": 1.0,
50
- "max_length": 20,
51
- "max_position_embeddings": 512,
52
- "min_length": 0,
53
  "model_type": "instructblip_qformer",
54
- "no_repeat_ngram_size": 0,
55
- "num_attention_heads": 12,
56
- "num_beam_groups": 1,
57
- "num_beams": 1,
58
- "num_hidden_layers": 12,
59
- "num_return_sequences": 1,
60
- "output_attentions": false,
61
- "output_hidden_states": false,
62
- "output_scores": false,
63
- "pad_token_id": 0,
64
- "position_embedding_type": "absolute",
65
- "prefix": null,
66
- "problem_type": null,
67
- "pruned_heads": {},
68
- "remove_invalid_values": false,
69
- "repetition_penalty": 1.0,
70
- "return_dict": true,
71
- "return_dict_in_generate": false,
72
- "sep_token_id": null,
73
- "suppress_tokens": null,
74
- "task_specific_params": null,
75
- "temperature": 1.0,
76
- "tf_legacy_loss": false,
77
- "tie_encoder_decoder": false,
78
- "tie_word_embeddings": true,
79
- "tokenizer_class": null,
80
- "top_k": 50,
81
- "top_p": 1.0,
82
- "torch_dtype": null,
83
- "torchscript": false,
84
- "transformers_version": "4.31.0",
85
- "typical_p": 1.0,
86
- "use_bfloat16": false,
87
  "vocab_size": 30523
88
  },
89
  "quantization_config": {
@@ -95,165 +20,39 @@
95
  "llm_int8_skip_modules": null,
96
  "llm_int8_threshold": 6.0,
97
  "load_in_4bit": false,
98
- "load_in_8bit": true
 
99
  },
100
  "text_config": {
101
- "_name_or_path": "",
102
- "add_cross_attention": false,
103
  "architectures": [
104
  "LLaMAForCausalLM"
105
  ],
106
- "bad_words_ids": null,
107
- "begin_suppress_tokens": null,
108
  "bos_token_id": 0,
109
- "chunk_size_feed_forward": 0,
110
- "cross_attention_hidden_size": null,
111
- "decoder_start_token_id": null,
112
- "diversity_penalty": 0.0,
113
- "do_sample": false,
114
- "early_stopping": false,
115
- "encoder_no_repeat_ngram_size": 0,
116
  "eos_token_id": 1,
117
- "exponential_decay_length_penalty": null,
118
- "finetuning_task": null,
119
- "forced_bos_token_id": null,
120
- "forced_eos_token_id": null,
121
  "hidden_act": "silu",
122
  "hidden_size": 5120,
123
- "id2label": {
124
- "0": "LABEL_0",
125
- "1": "LABEL_1"
126
- },
127
  "initializer_range": 0.02,
128
  "intermediate_size": 13824,
129
- "is_decoder": false,
130
- "is_encoder_decoder": false,
131
- "label2id": {
132
- "LABEL_0": 0,
133
- "LABEL_1": 1
134
- },
135
- "length_penalty": 1.0,
136
- "max_length": 20,
137
- "max_position_embeddings": 2048,
138
  "max_sequence_length": 2048,
139
- "min_length": 0,
140
  "model_type": "llama",
141
- "no_repeat_ngram_size": 0,
142
  "num_attention_heads": 40,
143
- "num_beam_groups": 1,
144
- "num_beams": 1,
145
  "num_hidden_layers": 40,
146
  "num_key_value_heads": 40,
147
- "num_return_sequences": 1,
148
- "output_attentions": false,
149
- "output_hidden_states": false,
150
- "output_scores": false,
151
  "pad_token_id": -1,
152
- "prefix": null,
153
  "pretraining_tp": 1,
154
- "problem_type": null,
155
- "pruned_heads": {},
156
- "remove_invalid_values": false,
157
- "repetition_penalty": 1.0,
158
- "return_dict": true,
159
- "return_dict_in_generate": false,
160
  "rms_norm_eps": 1e-06,
161
  "rope_scaling": null,
162
- "sep_token_id": null,
163
- "suppress_tokens": null,
164
- "task_specific_params": null,
165
- "temperature": 1.0,
166
- "tf_legacy_loss": false,
167
- "tie_encoder_decoder": false,
168
  "tie_word_embeddings": false,
169
- "tokenizer_class": null,
170
- "top_k": 50,
171
- "top_p": 1.0,
172
  "torch_dtype": "float16",
173
- "torchscript": false,
174
- "transformers_version": "4.31.0",
175
- "typical_p": 1.0,
176
- "use_bfloat16": false,
177
- "use_cache": true,
178
  "vocab_size": 32001
179
  },
180
  "tie_word_embeddings": false,
181
  "torch_dtype": "float16",
182
- "transformers_version": null,
183
  "use_decoder_only_language_model": true,
184
  "vision_config": {
185
- "_name_or_path": "",
186
- "add_cross_attention": false,
187
- "architectures": null,
188
- "attention_dropout": 0.0,
189
- "bad_words_ids": null,
190
- "begin_suppress_tokens": null,
191
- "bos_token_id": null,
192
- "chunk_size_feed_forward": 0,
193
- "cross_attention_hidden_size": null,
194
- "decoder_start_token_id": null,
195
- "diversity_penalty": 0.0,
196
- "do_sample": false,
197
- "early_stopping": false,
198
- "encoder_no_repeat_ngram_size": 0,
199
- "eos_token_id": null,
200
- "exponential_decay_length_penalty": null,
201
- "finetuning_task": null,
202
- "forced_bos_token_id": null,
203
- "forced_eos_token_id": null,
204
- "hidden_act": "gelu",
205
- "hidden_size": 1408,
206
- "id2label": {
207
- "0": "LABEL_0",
208
- "1": "LABEL_1"
209
- },
210
- "image_size": 224,
211
- "initializer_range": 1e-10,
212
- "intermediate_size": 6144,
213
- "is_decoder": false,
214
- "is_encoder_decoder": false,
215
- "label2id": {
216
- "LABEL_0": 0,
217
- "LABEL_1": 1
218
- },
219
- "layer_norm_eps": 1e-06,
220
- "length_penalty": 1.0,
221
- "max_length": 20,
222
- "min_length": 0,
223
- "model_type": "instructblip_vision_model",
224
- "no_repeat_ngram_size": 0,
225
- "num_attention_heads": 16,
226
- "num_beam_groups": 1,
227
- "num_beams": 1,
228
- "num_hidden_layers": 39,
229
- "num_return_sequences": 1,
230
- "output_attentions": false,
231
- "output_hidden_states": false,
232
- "output_scores": false,
233
- "pad_token_id": null,
234
- "patch_size": 14,
235
- "prefix": null,
236
- "problem_type": null,
237
- "pruned_heads": {},
238
- "qkv_bias": true,
239
- "remove_invalid_values": false,
240
- "repetition_penalty": 1.0,
241
- "return_dict": true,
242
- "return_dict_in_generate": false,
243
- "sep_token_id": null,
244
- "suppress_tokens": null,
245
- "task_specific_params": null,
246
- "temperature": 1.0,
247
- "tf_legacy_loss": false,
248
- "tie_encoder_decoder": false,
249
- "tie_word_embeddings": true,
250
- "tokenizer_class": null,
251
- "top_k": 50,
252
- "top_p": 1.0,
253
- "torch_dtype": null,
254
- "torchscript": false,
255
- "transformers_version": "4.31.0",
256
- "typical_p": 1.0,
257
- "use_bfloat16": false
258
  }
259
  }
 
1
  {
 
2
  "_name_or_path": "Salesforce/instructblip-vicuna-13b",
3
  "architectures": [
4
  "InstructBlipForConditionalGeneration"
 
8
  "model_type": "instructblip",
9
  "num_query_tokens": 32,
10
  "qformer_config": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  "model_type": "instructblip_qformer",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  "vocab_size": 30523
13
  },
14
  "quantization_config": {
 
20
  "llm_int8_skip_modules": null,
21
  "llm_int8_threshold": 6.0,
22
  "load_in_4bit": false,
23
+ "load_in_8bit": true,
24
+ "quant_method": "bitsandbytes"
25
  },
26
  "text_config": {
 
 
27
  "architectures": [
28
  "LLaMAForCausalLM"
29
  ],
30
+ "attention_bias": false,
 
31
  "bos_token_id": 0,
 
 
 
 
 
 
 
32
  "eos_token_id": 1,
 
 
 
 
33
  "hidden_act": "silu",
34
  "hidden_size": 5120,
 
 
 
 
35
  "initializer_range": 0.02,
36
  "intermediate_size": 13824,
 
 
 
 
 
 
 
 
 
37
  "max_sequence_length": 2048,
 
38
  "model_type": "llama",
 
39
  "num_attention_heads": 40,
 
 
40
  "num_hidden_layers": 40,
41
  "num_key_value_heads": 40,
 
 
 
 
42
  "pad_token_id": -1,
 
43
  "pretraining_tp": 1,
 
 
 
 
 
 
44
  "rms_norm_eps": 1e-06,
45
  "rope_scaling": null,
46
+ "rope_theta": 10000.0,
 
 
 
 
 
47
  "tie_word_embeddings": false,
 
 
 
48
  "torch_dtype": "float16",
 
 
 
 
 
49
  "vocab_size": 32001
50
  },
51
  "tie_word_embeddings": false,
52
  "torch_dtype": "float16",
53
+ "transformers_version": "4.36.0.dev0",
54
  "use_decoder_only_language_model": true,
55
  "vision_config": {
56
+ "model_type": "instructblip_vision_model"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  }
58
  }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": -1,
6
+ "transformers_version": "4.36.0.dev0"
7
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f64b42050735dea0ae963d810a32acec1fed26e07e3b8b4e659786bbe02a2975
3
+ size 4952975416
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cbd08fe772d2d1cbce42dd5a28eb4b03c7bcb0652e33e9b4435b0e7d349ecac
3
+ size 4937206080
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fedbfd3561311adfc62a04ed388f3b5f180753d965f65b9181e7394bf729ee66
3
+ size 4666775848
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff