Leyo committed
Commit e4bb96b · 1 Parent(s): 1e12f07

take off prints

Files changed (1):
  1. modeling_siglip.py +0 -26
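The commit only deletes ad-hoc print() diagnostics from modeling_siglip.py (26 removals, no additions). If similar tracing is ever wanted again, one option the commit does not adopt (shown here purely as a hedged sketch) is to route the output through Python's standard logging module, so it stays silent unless DEBUG logging is enabled:

import logging

import torch

logger = logging.getLogger("modeling_siglip")

def debug_tensor(name: str, t: torch.Tensor) -> None:
    # Only formats and emits when DEBUG logging is enabled, so normal runs stay quiet.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("%s: shape=%s, first values=%s", name, tuple(t.shape), t.flatten()[:3])

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)  # switch to logging.INFO to silence the traces
    debug_tensor("pixel_values", torch.randn(1, 3, 224, 224))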
modeling_siglip.py CHANGED
@@ -194,14 +194,10 @@ class SiglipVisionEmbeddings(nn.Module):
         self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
 
     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
-        print("First values of pixel values:", pixel_values[0, 0, :3, :3])
 
         patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, width, grid, grid]
         embeddings = patch_embeds.flatten(2).transpose(1, 2)
 
-        print("Shape of embeddings: ", embeddings.shape)
-        print("First values of patch embeddings:", embeddings[0, :3, :3])
-
         embeddings = embeddings + self.position_embedding(self.position_ids)
         return embeddings
 
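This first hunk drops the prints that dumped the raw pixel values and the patch embeddings inside SiglipVisionEmbeddings.forward. Those intermediates can still be inspected without editing the module by attaching a PyTorch forward hook; a minimal sketch, assuming the upstream transformers SiglipVisionModel (which this file mirrors) and the google/siglip-base-patch16-224 checkpoint, neither of which is part of this commit:

import torch
from transformers import SiglipVisionModel

model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")

def embeddings_hook(module, inputs, output):
    # output has shape (batch, num_patches, hidden_size): the patch embeddings
    # after the position embeddings have been added.
    print("Shape of embeddings:", output.shape)
    print("First values of patch embeddings:", output[0, :3, :3])

handle = model.vision_model.embeddings.register_forward_hook(embeddings_hook)
with torch.no_grad():
    model(pixel_values=torch.randn(1, 3, 224, 224))
handle.remove()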
 
@@ -721,14 +717,10 @@ class SiglipTextTransformer(nn.Module):
         last_hidden_state = encoder_outputs[0]
         last_hidden_state = self.final_layer_norm(last_hidden_state)
 
-        print("Final text hidden states:", last_hidden_state[0, :3, :3])
-
         # Assuming "sticky" EOS tokenization, last token is always EOS.
         pooled_output = last_hidden_state[:, -1, :]
         pooled_output = self.head(pooled_output)
 
-        print("First values of text pooled output:", pooled_output[0, :3])
-
         if not return_dict:
             return (last_hidden_state, pooled_output) + encoder_outputs[1:]
 
@@ -843,13 +835,9 @@ class SiglipVisionTransformer(nn.Module):
         last_hidden_state = encoder_outputs[0]
         last_hidden_state = self.post_layernorm(last_hidden_state)
 
-        print("First values post layernorm:", last_hidden_state[0, :3, :3])
 
         pooled_output = self.head(last_hidden_state)
 
-        print("Shape of pooled vision output:", pooled_output.shape)
-        print("First values of pooled vision output:", pooled_output[0, :3])
-
         if not return_dict:
             return (last_hidden_state, pooled_output) + encoder_outputs[1:]
 
@@ -876,11 +864,6 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
         batch_size = hidden_state.shape[0]
         probe = self.probe.repeat(batch_size, 1, 1)
 
-        print("Shape of probe:", probe.shape)
-        print("First values of probe:", probe[0, :3, :3])
-        print("Shape of hidden state:", hidden_state.shape)
-        print("First values of hidden state:", hidden_state[0, :3, :3])
-
         hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
 
         residual = hidden_state
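The prints removed from SiglipMultiheadAttentionPoolingHead only reported the probe and hidden-state shapes. For context, the head pools a (batch, seq_len, dim) sequence by letting a single learned query token (the "probe") attend over it, as in the self.attention(probe, hidden_state, hidden_state) call kept above. A standalone sketch of that pattern; the dimensions and class name are illustrative assumptions, and the residual/MLP part of the real head is omitted:

import torch
from torch import nn

class AttentionPool(nn.Module):
    def __init__(self, dim: int = 768, num_heads: int = 12):
        super().__init__()
        self.probe = nn.Parameter(torch.randn(1, 1, dim))  # one learned query token
        self.attention = nn.MultiheadAttention(dim, num_heads, batch_first=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)  # (batch, 1, dim)
        pooled = self.attention(probe, hidden_state, hidden_state)[0]
        return pooled[:, 0]  # (batch, dim)

pool = AttentionPool()
print(pool(torch.randn(2, 196, 768)).shape)  # torch.Size([2, 768])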
@@ -1150,20 +1133,11 @@ class SiglipModel(SiglipPreTrainedModel):
         image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
         text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
 
-        print("Normalized image embeds:", image_embeds[0, :3])
-        print("Normalized text embeds:", text_embeds[0, :3])
-
         # cosine similarity as logits
         logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * self.temperature.exp() + self.bias
         logits_per_image = logits_per_text.t()
 
-        print("Learned temperature:", self.temperature)
-        print("Learned bias:", self.bias)
-
         z = torch.matmul(image_embeds, text_embeds.t()) * self.temperature.exp()
-        print("Multiplying by temperature:", z[:3, :3])
-
-        print("Logits per image:", logits_per_image[:3, :3])
 
         loss = None
         if return_loss:
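The final hunk leaves SiglipModel's logit computation itself untouched: both embedding matrices are L2-normalized, so the matmul is a cosine similarity, which is then scaled by the exponential of a learned temperature and shifted by a learned bias. A small numeric sketch of that formula with random stand-in embeddings; the temperature and bias values below are illustrative, not the learned ones:

import torch

torch.manual_seed(0)
image_embeds = torch.randn(2, 768)
text_embeds = torch.randn(3, 768)

# L2-normalize so the matmul below is a cosine similarity.
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

temperature = torch.tensor(1.0)  # stored on a log scale; exp() recovers the multiplier
bias = torch.tensor(-10.0)       # illustrative value

logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * temperature.exp() + bias
logits_per_image = logits_per_text.t()
print(logits_per_image.shape)  # torch.Size([2, 3])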
 