doevent committed
Commit: 79d9622
Parent: 5083fe9

Upload models/blip_vqa.py

Files changed (1)
  1. models/blip_vqa.py +186 -0
models/blip_vqa.py ADDED
@@ -0,0 +1,186 @@
+from models.med import BertConfig, BertModel, BertLMHeadModel
+from models.blip import create_vit, init_tokenizer, load_checkpoint
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from transformers import BertTokenizer
+import numpy as np
+
+class BLIP_VQA(nn.Module):
+    def __init__(self,
+                 med_config = 'configs/med_config.json',
+                 image_size = 480,
+                 vit = 'base',
+                 vit_grad_ckpt = False,
+                 vit_ckpt_layer = 0,
+                 ):
+        """
+        Args:
+            med_config (str): path for the mixture of encoder-decoder model's configuration file
+            image_size (int): input image size
+            vit (str): model size of vision transformer
+        """
+        super().__init__()
+
+        self.visual_encoder, vision_width = create_vit(vit, image_size, vit_grad_ckpt, vit_ckpt_layer, drop_path_rate=0.1)
+        self.tokenizer = init_tokenizer()
+
+        encoder_config = BertConfig.from_json_file(med_config)
+        encoder_config.encoder_width = vision_width
+        self.text_encoder = BertModel(config=encoder_config, add_pooling_layer=False)
+
+        decoder_config = BertConfig.from_json_file(med_config)
+        self.text_decoder = BertLMHeadModel(config=decoder_config)
+
+
+    def forward(self, image, question, answer=None, n=None, weights=None, train=True, inference='rank', k_test=128):
+
+        image_embeds = self.visual_encoder(image)
+        image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
+
+        question = self.tokenizer(question, padding='longest', truncation=True, max_length=35,
+                                  return_tensors="pt").to(image.device)
+        question.input_ids[:,0] = self.tokenizer.enc_token_id
+
+        if train:
+            '''
+            n: number of answers for each question
+            weights: weight for each answer
+            '''
+            answer = self.tokenizer(answer, padding='longest', return_tensors="pt").to(image.device)
+            answer.input_ids[:,0] = self.tokenizer.bos_token_id
+            answer_targets = answer.input_ids.masked_fill(answer.input_ids == self.tokenizer.pad_token_id, -100)
+
+            question_output = self.text_encoder(question.input_ids,
+                                                attention_mask = question.attention_mask,
+                                                encoder_hidden_states = image_embeds,
+                                                encoder_attention_mask = image_atts,
+                                                return_dict = True)
+
+            question_states = []
+            question_atts = []
+            for b, n in enumerate(n):
+                question_states += [question_output.last_hidden_state[b]]*n
+                question_atts += [question.attention_mask[b]]*n
+            question_states = torch.stack(question_states,0)
+            question_atts = torch.stack(question_atts,0)
+
+            answer_output = self.text_decoder(answer.input_ids,
+                                              attention_mask = answer.attention_mask,
+                                              encoder_hidden_states = question_states,
+                                              encoder_attention_mask = question_atts,
+                                              labels = answer_targets,
+                                              return_dict = True,
+                                              reduction = 'none',
+                                             )
+
+            loss = weights * answer_output.loss
+            loss = loss.sum()/image.size(0)
+
+            return loss
+
+
+        else:
+            question_output = self.text_encoder(question.input_ids,
+                                                attention_mask = question.attention_mask,
+                                                encoder_hidden_states = image_embeds,
+                                                encoder_attention_mask = image_atts,
+                                                return_dict = True)
+
+            if inference=='generate':
+                num_beams = 3
+                question_states = question_output.last_hidden_state.repeat_interleave(num_beams,dim=0)
+                question_atts = torch.ones(question_states.size()[:-1],dtype=torch.long).to(question_states.device)
+                model_kwargs = {"encoder_hidden_states": question_states, "encoder_attention_mask":question_atts}
+
+                bos_ids = torch.full((image.size(0),1),fill_value=self.tokenizer.bos_token_id,device=image.device)
+
+                outputs = self.text_decoder.generate(input_ids=bos_ids,
+                                                     max_length=10,
+                                                     min_length=1,
+                                                     num_beams=num_beams,
+                                                     eos_token_id=self.tokenizer.sep_token_id,
+                                                     pad_token_id=self.tokenizer.pad_token_id,
+                                                     **model_kwargs)
+
+                answers = []
+                for output in outputs:
+                    answer = self.tokenizer.decode(output, skip_special_tokens=True)
+                    answers.append(answer)
+                return answers
+
+            elif inference=='rank':
+                max_ids = self.rank_answer(question_output.last_hidden_state, question.attention_mask,
+                                           answer.input_ids, answer.attention_mask, k_test)
+                return max_ids
+
+
+
+    def rank_answer(self, question_states, question_atts, answer_ids, answer_atts, k):
+
+        num_ques = question_states.size(0)
+        start_ids = answer_ids[0,0].repeat(num_ques,1) # bos token
+
+        start_output = self.text_decoder(start_ids,
+                                         encoder_hidden_states = question_states,
+                                         encoder_attention_mask = question_atts,
+                                         return_dict = True,
+                                         reduction = 'none')
+        logits = start_output.logits[:,0,:] # first token's logit
+
+        # topk_probs: top-k probability
+        # topk_ids: [num_question, k]
+        answer_first_token = answer_ids[:,1]
+        prob_first_token = F.softmax(logits,dim=1).index_select(dim=1, index=answer_first_token)
+        topk_probs, topk_ids = prob_first_token.topk(k,dim=1)
+
+        # answer input: [num_question*k, answer_len]
+        input_ids = []
+        input_atts = []
+        for b, topk_id in enumerate(topk_ids):
+            input_ids.append(answer_ids.index_select(dim=0, index=topk_id))
+            input_atts.append(answer_atts.index_select(dim=0, index=topk_id))
+        input_ids = torch.cat(input_ids,dim=0)
+        input_atts = torch.cat(input_atts,dim=0)
+
+        targets_ids = input_ids.masked_fill(input_ids == self.tokenizer.pad_token_id, -100)
+
+        # repeat encoder's output for top-k answers
+        question_states = tile(question_states, 0, k)
+        question_atts = tile(question_atts, 0, k)
+
+        output = self.text_decoder(input_ids,
+                                   attention_mask = input_atts,
+                                   encoder_hidden_states = question_states,
+                                   encoder_attention_mask = question_atts,
+                                   labels = targets_ids,
+                                   return_dict = True,
+                                   reduction = 'none')
+
+        log_probs_sum = -output.loss
+        log_probs_sum = log_probs_sum.view(num_ques,k)
+
+        max_topk_ids = log_probs_sum.argmax(dim=1)
+        max_ids = topk_ids[max_topk_ids>=0,max_topk_ids]
+
+        return max_ids
+
+
+def blip_vqa(pretrained='',**kwargs):
+    model = BLIP_VQA(**kwargs)
+    if pretrained:
+        model,msg = load_checkpoint(model,pretrained)
+        # assert(len(msg.missing_keys)==0)
+    return model
+
+
+def tile(x, dim, n_tile):
+    init_dim = x.size(dim)
+    repeat_idx = [1] * x.dim()
+    repeat_idx[dim] = n_tile
+    x = x.repeat(*(repeat_idx))
+    order_index = torch.LongTensor(np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
+    return torch.index_select(x, dim, order_index.to(x.device))
+
+
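Note: the snippet below is a minimal usage sketch for the file added in this commit, not part of the commit itself. It assumes the surrounding BLIP repository layout (models/blip.py, configs/med_config.json), an installed torch/torchvision/transformers stack, and uses 'demo.jpg' and 'blip_vqa.pth' as placeholder paths for an input image and a downloaded BLIP VQA checkpoint; the preprocessing transform is an assumption based on the preprocessing used elsewhere in the BLIP repo.

import torch
from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

from models.blip_vqa import blip_vqa

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
image_size = 480  # matches the default BLIP_VQA image_size

# Resize and normalize the input image (CLIP-style normalization statistics).
transform = transforms.Compose([
    transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
                         (0.26862954, 0.26130258, 0.27577711)),
])
image = transform(Image.open('demo.jpg').convert('RGB')).unsqueeze(0).to(device)

# 'blip_vqa.pth' is a placeholder checkpoint path passed to load_checkpoint().
model = blip_vqa(pretrained='blip_vqa.pth', image_size=image_size, vit='base')
model.eval()
model = model.to(device)

question = 'where is the dog sitting?'
with torch.no_grad():
    # inference='generate' beam-searches a free-form answer;
    # inference='rank' instead scores a list of candidate answers via rank_answer().
    answer = model(image, question, train=False, inference='generate')
print('answer:', answer[0])

With inference='rank', the caller must also pass pre-tokenized candidate answers (an object exposing input_ids and attention_mask), since forward() hands answer.input_ids and answer.attention_mask to rank_answer, which keeps the k_test candidates with the highest first-token probability and returns the index of the candidate with the highest decoder log-likelihood.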