Natthaphon committed on
Commit
e137a83
·
1 Parent(s): 056a690

Added model

Browse files
config.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/media/palm/BiggerData/capgen/hub/pth/gpt2_clip_1e-4_encoder_freeze",
3
+ "architectures": [
4
+ "CLIPEncoderDecoderModel"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_clipcap.CLIPEncoderDecoderConfig",
8
+ "AutoModel": "modeling_clipcap.CLIPEncoderDecoderModel"
9
+ },
10
+ "decoder": {
11
+ "_name_or_path": "/project/lt200203-aimedi/palm/huggingface/gpt2",
12
+ "add_cross_attention": true,
13
+ "architectures": [
14
+ "GPT2LMHeadModel"
15
+ ],
16
+ "is_decoder": true,
17
+ "model_type": "gpt2",
18
+ "n_ctx": 1024,
19
+ "task_specific_params": {
20
+ "text-generation": {
21
+ "do_sample": true,
22
+ "max_length": 50
23
+ }
24
+ }
25
+ },
26
+ "decoder_start_token_id": 50256,
27
+ "encoder": {
28
+ "_name_or_path": "",
29
+ "add_cross_attention": false,
30
+ "architectures": null,
31
+ "attention_dropout": 0.0,
32
+ "bad_words_ids": null,
33
+ "begin_suppress_tokens": null,
34
+ "bos_token_id": null,
35
+ "chunk_size_feed_forward": 0,
36
+ "cross_attention_hidden_size": null,
37
+ "decoder_start_token_id": null,
38
+ "diversity_penalty": 0.0,
39
+ "do_sample": false,
40
+ "dropout": 0.0,
41
+ "early_stopping": false,
42
+ "encoder_no_repeat_ngram_size": 0,
43
+ "eos_token_id": null,
44
+ "exponential_decay_length_penalty": null,
45
+ "finetuning_task": null,
46
+ "forced_bos_token_id": null,
47
+ "forced_eos_token_id": null,
48
+ "hidden_act": "quick_gelu",
49
+ "hidden_size": 512,
50
+ "id2label": {
51
+ "0": "LABEL_0",
52
+ "1": "LABEL_1"
53
+ },
54
+ "image_size": 224,
55
+ "initializer_factor": 1.0,
56
+ "initializer_range": 0.02,
57
+ "intermediate_size": 3072,
58
+ "is_decoder": false,
59
+ "is_encoder_decoder": false,
60
+ "label2id": {
61
+ "LABEL_0": 0,
62
+ "LABEL_1": 1
63
+ },
64
+ "layer_norm_eps": 1e-05,
65
+ "length_penalty": 1.0,
66
+ "max_length": 20,
67
+ "min_length": 0,
68
+ "model_type": "clip_vision_model",
69
+ "no_repeat_ngram_size": 0,
70
+ "num_attention_heads": 12,
71
+ "num_beam_groups": 1,
72
+ "num_beams": 1,
73
+ "num_channels": 3,
74
+ "num_hidden_layers": 12,
75
+ "num_return_sequences": 1,
76
+ "output_attentions": false,
77
+ "output_hidden_states": false,
78
+ "output_scores": false,
79
+ "pad_token_id": null,
80
+ "patch_size": 32,
81
+ "prefix": null,
82
+ "problem_type": null,
83
+ "projection_dim": 512,
84
+ "pruned_heads": {},
85
+ "remove_invalid_values": false,
86
+ "repetition_penalty": 1.0,
87
+ "return_dict": true,
88
+ "return_dict_in_generate": false,
89
+ "sep_token_id": null,
90
+ "suppress_tokens": null,
91
+ "task_specific_params": null,
92
+ "temperature": 1.0,
93
+ "tf_legacy_loss": false,
94
+ "tie_encoder_decoder": false,
95
+ "tie_word_embeddings": true,
96
+ "tokenizer_class": null,
97
+ "top_k": 50,
98
+ "top_p": 1.0,
99
+ "torch_dtype": null,
100
+ "torchscript": false,
101
+ "typical_p": 1.0,
102
+ "use_bfloat16": false
103
+ },
104
+ "eos_token_id": 50256,
105
+ "is_encoder_decoder": true,
106
+ "model_type": "clip-encoder-decoder",
107
+ "pad_token_id": 50256,
108
+ "tie_word_embeddings": false,
109
+ "torch_dtype": "float32",
110
+ "transformers_version": "4.36.2"
111
+ }
configuration_clipcap.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig, AutoConfig
2
+
3
+
4
class CLIPEncoderDecoderConfig(PretrainedConfig):
    """
    Configuration for a CLIP-encoder / causal-LM-decoder captioning model.

    The decoder part is stored as a config object in `self.decoder`, built with `AutoConfig.for_model` from a
    dictionary of keyword arguments. Any other keyword argument (including `encoder`, as passed by
    `from_encoder_decoder_configs`) is forwarded unchanged to `PretrainedConfig.__init__`.
    """

    model_type = "clip-encoder-decoder"

    @staticmethod
    def _default_decoder_kwargs():
        """Return a freshly built dict of the default (GPT-2) decoder settings.

        A new dict is created on every call so that nested mutable values (e.g. `pruned_heads`) are never shared
        between config instances.
        """
        return {
            '_name_or_path': '',
            'activation_function': 'gelu_new',
            'add_cross_attention': True,
            'architectures': ['GPT2LMHeadModel'],
            'attn_pdrop': 0.1,
            'bad_words_ids': None,
            'begin_suppress_tokens': None,
            'bos_token_id': 50256,
            'chunk_size_feed_forward': 0,
            'cross_attention_hidden_size': None,
            'decoder_start_token_id': None,
            'diversity_penalty': 0.0,
            'do_sample': False,
            'early_stopping': False,
            'embd_pdrop': 0.1,
            'encoder_no_repeat_ngram_size': 0,
            'eos_token_id': 50256,
            'exponential_decay_length_penalty': None,
            'finetuning_task': None,
            'forced_bos_token_id': None,
            'forced_eos_token_id': None,
            'id2label': {'0': 'LABEL_0', '1': 'LABEL_1'},
            'initializer_range': 0.02,
            'is_decoder': True,
            'is_encoder_decoder': False,
            'label2id': {'LABEL_0': 0, 'LABEL_1': 1},
            'layer_norm_epsilon': 1e-05,
            'length_penalty': 1.0,
            'max_length': 20,
            'min_length': 0,
            'model_type': 'gpt2',
            'n_ctx': 1024,
            'n_embd': 768,
            'n_head': 12,
            'n_inner': None,
            'n_layer': 12,
            'n_positions': 1024,
            'no_repeat_ngram_size': 0,
            'num_beam_groups': 1,
            'num_beams': 1,
            'num_return_sequences': 1,
            'output_attentions': False,
            'output_hidden_states': False,
            'output_scores': False,
            'pad_token_id': None,
            'prefix': None,
            'problem_type': None,
            'pruned_heads': {},
            'remove_invalid_values': False,
            'reorder_and_upcast_attn': False,
            'repetition_penalty': 1.0,
            'resid_pdrop': 0.1,
            'return_dict': True,
            'return_dict_in_generate': False,
            'scale_attn_by_inverse_layer_idx': False,
            'scale_attn_weights': True,
            'sep_token_id': None,
            'summary_activation': None,
            'summary_first_dropout': 0.1,
            'summary_proj_to_labels': True,
            'summary_type': 'cls_index',
            'summary_use_proj': True,
            'suppress_tokens': None,
            'task_specific_params': {'text-generation': {'do_sample': True,
                                                         'max_length': 50}},
            'temperature': 1.0,
            'tf_legacy_loss': False,
            'tie_encoder_decoder': False,
            'tie_word_embeddings': True,
            'tokenizer_class': None,
            'top_k': 50,
            'top_p': 1.0,
            'torch_dtype': None,
            'torchscript': False,
            'typical_p': 1.0,
            'use_bfloat16': False,
            'use_cache': True,
            'vocab_size': 50257,
        }

    def __init__(self, decoder=None, **kwargs):
        """
        Args:
            decoder (`dict`, *optional*):
                Keyword arguments for `AutoConfig.for_model`; must contain `model_type`. When omitted, the
                default GPT-2 settings from `_default_decoder_kwargs` are used.
            **kwargs:
                Forwarded to `PretrainedConfig.__init__`.
        """
        super().__init__(**kwargs)

        # A `None` default replaces the former dict-literal default, which was a single object
        # shared by every call that relied on it.
        if decoder is None:
            decoder = self._default_decoder_kwargs()

        self.decoder = AutoConfig.for_model(**decoder)
        self.is_encoder_decoder = True

    @classmethod
    def from_encoder_decoder_configs(
        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a [`CLIPEncoderDecoderConfig`] (or a derived class) from an encoder model configuration and a
        decoder model configuration.

        Note that `decoder_config` is modified in place: `is_decoder` and `add_cross_attention` are set to `True`
        before it is serialized.

        Returns:
            [`CLIPEncoderDecoderConfig`]: An instance of a configuration object
        """
        decoder_config.is_decoder = True
        decoder_config.add_cross_attention = True

        return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.36.2"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04fa063f78c3046b68b78d319a30fac67d7a0c38f6343109e0d9b7fa084490dd
3
+ size 1118642856
modeling_clipcap.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import (
2
+ PreTrainedModel,
3
+ VisionEncoderDecoderModel,
4
+ VisionEncoderDecoderConfig,
5
+ AutoModel,
6
+ AutoModelForCausalLM,
7
+ AutoConfig
8
+ )
9
+ from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
10
+ from torch import nn
11
+ from .configuration_clipcap import CLIPEncoderDecoderConfig
12
+ from typing import Optional, Tuple, Union
13
+ import torch
14
+ import gc
15
+ import os
16
+ import tempfile
17
+
18
+
19
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Build decoder inputs by moving every token of `input_ids` one position to the right.

    Column 0 of the result is set to `decoder_start_token_id`, the last column of `input_ids` is dropped, and
    any -100 entries remaining in the result are replaced by `pad_token_id`.

    Raises:
        ValueError: if `decoder_start_token_id` or `pad_token_id` is `None`.
    """
    shifted = input_ids.new_zeros(input_ids.size())
    shifted[:, 1:].copy_(input_ids[:, :-1])

    if decoder_start_token_id is None:
        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
    shifted[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")

    # -100 values coming from labels are swapped for the padding token id.
    ignored_positions = shifted == -100
    shifted[ignored_positions] = pad_token_id

    return shifted
35
+
36
+
37
class Encoder(nn.Module):
    """
    Image encoder built from the vision tower and visual projection of `openai/clip-vit-base-patch32`.

    Each image is reduced to a single projected feature vector, returned with shape
    `(batch, 1, projection_dim)` as `last_hidden_state`.
    """

    main_input_name = 'pixel_values'

    def __init__(self):
        super().__init__()
        clip_model = AutoModel.from_pretrained('openai/clip-vit-base-patch32')
        self.vision_model = clip_model.vision_model
        self.visual_projection = clip_model.visual_projection
        self.config = clip_model.vision_model.config
        # Advertise the projected feature size, since that is what `forward` outputs.
        self.config.hidden_size = clip_model.config.projection_dim

    def forward(self, pixel_values, output_attentions=None, output_hidden_states=None, return_dict=False, **kwargs):
        """
        Encode `pixel_values` and return a `BaseModelOutput` whose `last_hidden_state` holds the projected
        pooled feature. Extra `**kwargs` are accepted but not used; the return type does not depend on
        `return_dict`, which is only forwarded to the vision model.
        """
        tower_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled = tower_outputs[1]  # pooled_output
        batch_size = pooled.size(0)
        projected = self.visual_projection(pooled)
        # Insert a length-1 sequence axis.
        return BaseModelOutput(last_hidden_state=projected.view(batch_size, 1, -1))

    def get_output_embeddings(self):
        """The encoder has no output embeddings; always returns `None`."""
        return None
60
+
61
class CLIPEncoderDecoderModel(PreTrainedModel):
    """
    Image-captioning model that pairs the CLIP-based `Encoder` with a causal language-model decoder.

    The encoder output is passed through a linear layer (`enc_to_dec_proj`) and handed to the decoder as
    `encoder_hidden_states`.
    """

    config_class = CLIPEncoderDecoderConfig
    base_model_prefix = "clip_encoder_decoder"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def __init__(
        self,
        config=None,
        encoder=None,
        decoder=None,
    ):
        """
        Args:
            config: model configuration; required. Must expose `encoder` and `decoder` attributes.
            encoder: accepted for signature compatibility; it is overwritten by a freshly built `Encoder()`.
            decoder: optional decoder module. When `None`, one is built from `config.decoder` with
                `AutoModelForCausalLM.from_config`.

        Raises:
            ValueError: if `config` is `None`.
        """
        if config is None:
            # Fail with a clear message instead of an AttributeError on `None` below.
            raise ValueError("A `config` has to be provided to instantiate a CLIPEncoderDecoderModel.")

        config.tie_word_embeddings = False
        super().__init__(config)

        # NOTE(review): any `encoder` passed by the caller is discarded here -- confirm this is intended.
        encoder = Encoder()
        # Read the size before `encoder.config` is replaced below.
        encoder_hidden_size = encoder.config.hidden_size

        if decoder is None:
            decoder = AutoModelForCausalLM.from_config(config.decoder)

        self.encoder = encoder
        self.decoder = decoder

        # Make the sub-modules share the configs stored on the composite config.
        self.encoder.config = self.config.encoder
        self.decoder.config = self.config.decoder

        self.enc_to_dec_proj = nn.Linear(encoder_hidden_size, self.decoder.config.hidden_size)

    def get_encoder(self):
        """Return the encoder module."""
        return self.encoder

    def get_decoder(self):
        """Return the decoder module."""
        return self.decoder

    def get_output_embeddings(self):
        """Return the decoder's output embeddings."""
        return self.decoder.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        """Set the decoder's output embeddings."""
        return self.decoder.set_output_embeddings(new_embeddings)

    @classmethod
    def from_encoder_decoder_pretrained(
        cls,
        encoder_pretrained_model_name_or_path: str = None,
        decoder_pretrained_model_name_or_path: str = None,
        *model_args,
        **kwargs,
    ) -> PreTrainedModel:
        """
        Build a model from separately pretrained encoder and decoder checkpoints.

        Keyword arguments prefixed with `encoder_` / `decoder_` are routed (prefix stripped) to the respective
        loader; `encoder_model` / `decoder_model` may carry already-instantiated modules. Remaining keyword
        arguments go to the composite config.

        Raises:
            ValueError: if neither a model nor a checkpoint path is given for the encoder or for the decoder.
        """
        kwargs_encoder = {
            argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
        }

        kwargs_decoder = {
            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
        }

        # remove encoder, decoder kwargs from kwargs
        for key in kwargs_encoder.keys():
            del kwargs["encoder_" + key]
        for key in kwargs_decoder.keys():
            del kwargs["decoder_" + key]

        # Load and initialize the encoder and decoder
        # The distinction between encoder and decoder at the model level is made
        # by the value of the flag `is_decoder` that we need to set correctly.
        encoder = kwargs_encoder.pop("model", None)
        if encoder is None:
            if encoder_pretrained_model_name_or_path is None:
                raise ValueError(
                    "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
                    "to be defined."
                )

            if "config" not in kwargs_encoder:
                encoder_config, kwargs_encoder = AutoConfig.from_pretrained(
                    encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
                )

                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
                    encoder_config.is_decoder = False
                    encoder_config.add_cross_attention = False

                kwargs_encoder["config"] = encoder_config

            encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)

        decoder = kwargs_decoder.pop("model", None)
        if decoder is None:
            if decoder_pretrained_model_name_or_path is None:
                raise ValueError(
                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
                    "to be defined."
                )

            if "config" not in kwargs_decoder:
                decoder_config, kwargs_decoder = AutoConfig.from_pretrained(
                    decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
                )

                if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
                    decoder_config.is_decoder = True
                    decoder_config.add_cross_attention = True

                kwargs_decoder["config"] = decoder_config

            decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)

        # instantiate config with corresponding kwargs
        config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)

        # make sure input & output embeddings is not tied
        config.tie_word_embeddings = False
        return cls(encoder=encoder, decoder=decoder, config=config)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        """
        Encode the image (unless `encoder_outputs` is supplied), run the decoder with cross-attention over the
        projected image features, and optionally compute the cross-entropy loss against `labels`.

        Extra keyword arguments starting with `decoder_` are passed to the decoder (prefix stripped); all other
        extra keyword arguments are passed to the encoder.

        Returns:
            A `Seq2SeqLMOutput` when `return_dict` resolves to true; otherwise a flat tuple made of the optional
            loss, the decoder outputs and the encoder outputs.

        Raises:
            ValueError: if both `encoder_outputs` and `pixel_values` are `None`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}

        kwargs_decoder = {
            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
        }

        if encoder_outputs is None:
            if pixel_values is None:
                raise ValueError("You have to specify pixel_values")

            encoder_outputs = self.encoder(
                pixel_values,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                **kwargs_encoder,
            )
        elif isinstance(encoder_outputs, tuple):
            encoder_outputs = BaseModelOutput(*encoder_outputs)

        encoder_hidden_states = encoder_outputs[0]

        # Map the encoder feature size onto the decoder hidden size.
        encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)

        # No mask is passed for the encoder states.
        encoder_attention_mask = None

        if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
            decoder_input_ids = shift_tokens_right(
                labels, self.config.pad_token_id, self.config.decoder_start_token_id
            )

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            past_key_values=past_key_values,
            return_dict=return_dict,
            **kwargs_decoder,
        )

        # Compute loss independent from decoder (as some shift the logits inside them)
        loss = None
        if labels is not None:
            logits = decoder_outputs.logits if return_dict else decoder_outputs[0]
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.reshape(-1, self.decoder.config.vocab_size), labels.reshape(-1))

        if not return_dict:
            # `Encoder.forward` always returns a `BaseModelOutput`, which cannot be concatenated with a
            # tuple directly, so flatten it first.
            if hasattr(encoder_outputs, "to_tuple"):
                encoder_tuple = encoder_outputs.to_tuple()
            else:
                encoder_tuple = tuple(encoder_outputs)

            if loss is not None:
                return (loss,) + decoder_outputs + encoder_tuple
            return decoder_outputs + encoder_tuple

        return Seq2SeqLMOutput(
            loss=loss,
            logits=decoder_outputs.logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        """Derive decoder input ids from `labels` by shifting them one position to the right."""
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
    ):
        """
        Assemble the keyword arguments for one generation step, delegating the decoder-side preparation to
        `self.decoder.prepare_inputs_for_generation`.
        """
        decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)
        decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None
        input_dict = {
            "attention_mask": attention_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "decoder_input_ids": decoder_inputs["input_ids"],
            "encoder_outputs": encoder_outputs,
            "past_key_values": decoder_inputs["past_key_values"],
            "use_cache": use_cache,
        }
        return input_dict

    def resize_token_embeddings(self, *args, **kwargs):
        """Not supported on the composite model; always raises `NotImplementedError`."""
        raise NotImplementedError(
            "Resizing the embedding layers via the VisionEncoderDecoderModel directly is not supported.Please use the"
            " respective methods of the wrapped decoder object (model.decoder.resize_token_embeddings(...))"
        )

    def _reorder_cache(self, past_key_values, beam_idx):
        # apply decoder cache reordering here
        return self.decoder._reorder_cache(past_key_values, beam_idx)
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_processor_type": "CLIPImageProcessor",
17
+ "image_std": [
18
+ 0.26862954,
19
+ 0.26130258,
20
+ 0.27577711
21
+ ],
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "shortest_edge": 224
26
+ }
27
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": true,
15
+ "eos_token": "<|endoftext|>",
16
+ "model_max_length": 1000000000000000019884624838656,
17
+ "pad_token": "<|endoftext|>",
18
+ "tokenizer_class": "GPT2Tokenizer",
19
+ "unk_token": "<|endoftext|>"
20
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff