take off prints

modeling_siglip.py   CHANGED   +0 -26
@@ -194,14 +194,10 @@ class SiglipVisionEmbeddings(nn.Module):
         self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
 
     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
-        print("First values of pixel values:", pixel_values[0, 0, :3, :3])
 
         patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, width, grid, grid]
         embeddings = patch_embeds.flatten(2).transpose(1, 2)
 
-        print("Shape of embeddings: ", embeddings.shape)
-        print("First values of patch embeddings:", embeddings[0, :3, :3])
-
         embeddings = embeddings + self.position_embedding(self.position_ids)
         return embeddings
 
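For reference, the shape flow in SiglipVisionEmbeddings.forward that the removed prints were tracing: the convolutional patch_embedding maps [batch, 3, H, W] to [batch, width, grid, grid], flatten(2).transpose(1, 2) turns that into [batch, grid*grid, width], and learned position embeddings are added per patch position. A minimal standalone sketch, using illustrative sizes (224x224 input, 16x16 patches, width 768) and plain Conv2d/Embedding stand-ins rather than this checkpoint's actual config:

    # Minimal sketch (not this repository's code) of the embedding shape flow.
    import torch
    import torch.nn as nn

    width, patch, image = 768, 16, 224                          # illustrative sizes
    num_positions = (image // patch) ** 2                       # 196 patches
    patch_embedding = nn.Conv2d(3, width, kernel_size=patch, stride=patch)
    position_embedding = nn.Embedding(num_positions, width)
    position_ids = torch.arange(num_positions).expand((1, -1))

    pixel_values = torch.randn(2, 3, image, image)
    patch_embeds = patch_embedding(pixel_values)                # [2, 768, 14, 14]
    embeddings = patch_embeds.flatten(2).transpose(1, 2)        # [2, 196, 768]
    embeddings = embeddings + position_embedding(position_ids)  # broadcast over batch
    print(embeddings.shape)                                     # torch.Size([2, 196, 768])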
@@ -721,14 +717,10 @@ class SiglipTextTransformer(nn.Module):
         last_hidden_state = encoder_outputs[0]
         last_hidden_state = self.final_layer_norm(last_hidden_state)
 
-        print("Final text hidden states:", last_hidden_state[0, :3, :3])
-
         # Assuming "sticky" EOS tokenization, last token is always EOS.
         pooled_output = last_hidden_state[:, -1, :]
         pooled_output = self.head(pooled_output)
 
-        print("First values of text pooled output:", pooled_output[0, :3])
-
         if not return_dict:
             return (last_hidden_state, pooled_output) + encoder_outputs[1:]
 
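The text tower pools by taking the hidden state at the last position, which the comment justifies by "sticky" EOS tokenization (the last token is always EOS). A minimal sketch of that pooling path with made-up sizes and a plain Linear standing in for the real head:

    # Minimal sketch with hypothetical sizes (batch 2, sequence 64, hidden 768).
    import torch
    import torch.nn as nn

    hidden = 768
    final_layer_norm = nn.LayerNorm(hidden)
    head = nn.Linear(hidden, hidden)                 # stand-in for the text pooling head

    last_hidden_state = torch.randn(2, 64, hidden)   # encoder output [batch, seq, hidden]
    last_hidden_state = final_layer_norm(last_hidden_state)
    pooled_output = last_hidden_state[:, -1, :]      # last position, assumed to be EOS
    pooled_output = head(pooled_output)
    print(pooled_output.shape)                       # torch.Size([2, 768])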
@@ -843,13 +835,9 @@ class SiglipVisionTransformer(nn.Module):
         last_hidden_state = encoder_outputs[0]
         last_hidden_state = self.post_layernorm(last_hidden_state)
 
-        print("First values post layernorm:", last_hidden_state[0, :3, :3])
 
         pooled_output = self.head(last_hidden_state)
 
-        print("Shape of pooled vision output:", pooled_output.shape)
-        print("First values of pooled vision output:", pooled_output[0, :3])
-
         if not return_dict:
             return (last_hidden_state, pooled_output) + encoder_outputs[1:]
 
@@ -876,11 +864,6 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
         batch_size = hidden_state.shape[0]
         probe = self.probe.repeat(batch_size, 1, 1)
 
-        print("Shape of probe:", probe.shape)
-        print("First values of probe:", probe[0, :3, :3])
-        print("Shape of hidden state:", hidden_state.shape)
-        print("First values of hidden state:", hidden_state[0, :3, :3])
-
         hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
 
         residual = hidden_state
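The head called by SiglipVisionTransformer above is this multihead-attention pooling: a learned probe of shape [1, 1, hidden] is repeated across the batch and used as a single query attending over all patch tokens, so each image is pooled to one vector, and the residual = hidden_state line feeds the residual block that follows in the full class. A minimal sketch of the attention step built on torch.nn.MultiheadAttention with batch_first=True and illustrative sizes:

    import torch
    import torch.nn as nn

    hidden, heads, batch, num_patches = 768, 12, 2, 196    # illustrative sizes
    probe = nn.Parameter(torch.randn(1, 1, hidden))
    attention = nn.MultiheadAttention(hidden, heads, batch_first=True)

    hidden_state = torch.randn(batch, num_patches, hidden)  # patch tokens from the encoder
    queries = probe.repeat(batch, 1, 1)                     # [2, 1, 768], one probe per image
    pooled = attention(queries, hidden_state, hidden_state)[0]
    print(pooled.shape)                                     # torch.Size([2, 1, 768])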
@@ -1150,20 +1133,11 @@ class SiglipModel(SiglipPreTrainedModel):
         image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
         text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
 
-        print("Normalized image embeds:", image_embeds[0, :3])
-        print("Normalized text embeds:", text_embeds[0, :3])
-
         # cosine similarity as logits
         logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * self.temperature.exp() + self.bias
         logits_per_image = logits_per_text.t()
 
-        print("Learned temperature:", self.temperature)
-        print("Learned bias:", self.bias)
-
         z = torch.matmul(image_embeds, text_embeds.t()) * self.temperature.exp()
-        print("Multiplying by temperature:", z[:3, :3])
-
-        print("Logits per image:", logits_per_image[:3, :3])
 
         loss = None
         if return_loss:
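The logits in this last hunk are SigLIP's pairwise similarity scores: with both embedding sets L2-normalized, torch.matmul(text_embeds, image_embeds.t()) is a cosine-similarity matrix, scaled by the learned temperature (stored in log space, hence the .exp()) and shifted by the learned bias, and logits_per_image is simply its transpose. Note that z, which existed only to feed one of the removed prints, is still computed after this change. A minimal standalone sketch with random embeddings and made-up temperature/bias values:

    import torch

    torch.manual_seed(0)
    image_embeds = torch.randn(4, 768)
    text_embeds = torch.randn(4, 768)
    log_temperature = torch.tensor(4.6)   # illustrative value, not the checkpoint's
    bias = torch.tensor(-12.9)            # illustrative value, not the checkpoint's

    image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

    logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * log_temperature.exp() + bias
    logits_per_image = logits_per_text.t()

    # SigLIP trains with a pairwise sigmoid loss on these logits; the diagonal
    # entries correspond to the matching image-text pairs.
    probs = torch.sigmoid(logits_per_image)
    print(probs.shape)  # torch.Size([4, 4])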
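With the prints removed from the modeling code, the same intermediate values can still be inspected without editing the file by attaching PyTorch forward hooks from the outside. A minimal sketch, assuming a transformers version that ships SiglipModel and the public google/siglip-base-patch16-224 checkpoint; the submodule paths mirror the class names in this file but may differ in other revisions:

    import torch
    from transformers import SiglipModel

    model = SiglipModel.from_pretrained("google/siglip-base-patch16-224")

    def print_output(name):
        def hook(module, inputs, output):
            out = output[0] if isinstance(output, tuple) else output
            print(f"{name}: shape={tuple(out.shape)}, first values={out.flatten()[:3]}")
        return hook

    handles = [
        model.vision_model.embeddings.register_forward_hook(print_output("vision embeddings")),
        model.vision_model.head.register_forward_hook(print_output("pooled vision output")),
        model.text_model.head.register_forward_hook(print_output("pooled text output")),
    ]

    # ... run a forward pass, e.g. model(**inputs) with processor-prepared inputs ...

    for h in handles:
        h.remove()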