take off prints

modeling_siglip.py   CHANGED   +0 -26
@@ -194,14 +194,10 @@ class SiglipVisionEmbeddings(nn.Module):
         self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
 
     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
-        print("First values of pixel values:", pixel_values[0, 0, :3, :3])
 
         patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, width, grid, grid]
         embeddings = patch_embeds.flatten(2).transpose(1, 2)
 
-        print("Shape of embeddings: ", embeddings.shape)
-        print("First values of patch embeddings:", embeddings[0, :3, :3])
-
         embeddings = embeddings + self.position_embedding(self.position_ids)
         return embeddings
 
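For reference, the shape flow in SiglipVisionEmbeddings.forward that the removed prints were tracing: the convolutional patch_embedding maps [batch, 3, H, W] to [batch, width, grid, grid], flatten(2).transpose(1, 2) turns that into [batch, grid*grid, width], and learned position embeddings are added per patch position. A minimal standalone sketch, using illustrative sizes (224x224 input, 16x16 patches, width 768) and plain Conv2d/Embedding stand-ins rather than this checkpoint's actual config:

    # Minimal sketch (not this repository's code) of the embedding shape flow.
    import torch
    import torch.nn as nn

    width, patch, image = 768, 16, 224                          # illustrative sizes
    num_positions = (image // patch) ** 2                       # 196 patches
    patch_embedding = nn.Conv2d(3, width, kernel_size=patch, stride=patch)
    position_embedding = nn.Embedding(num_positions, width)
    position_ids = torch.arange(num_positions).expand((1, -1))

    pixel_values = torch.randn(2, 3, image, image)
    patch_embeds = patch_embedding(pixel_values)                # [2, 768, 14, 14]
    embeddings = patch_embeds.flatten(2).transpose(1, 2)        # [2, 196, 768]
    embeddings = embeddings + position_embedding(position_ids)  # broadcast over batch
    print(embeddings.shape)                                     # torch.Size([2, 196, 768])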
@@ -721,14 +717,10 @@ class SiglipTextTransformer(nn.Module):
         last_hidden_state = encoder_outputs[0]
         last_hidden_state = self.final_layer_norm(last_hidden_state)
 
-        print("Final text hidden states:", last_hidden_state[0, :3, :3])
-
         # Assuming "sticky" EOS tokenization, last token is always EOS.
         pooled_output = last_hidden_state[:, -1, :]
         pooled_output = self.head(pooled_output)
 
-        print("First values of text pooled output:", pooled_output[0, :3])
-
         if not return_dict:
             return (last_hidden_state, pooled_output) + encoder_outputs[1:]
 
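The text tower pools by taking the hidden state at the last position, which the comment justifies by "sticky" EOS tokenization (the last token is always EOS). A minimal sketch of that pooling path with made-up sizes and a plain Linear standing in for the real head:

    # Minimal sketch with hypothetical sizes (batch 2, sequence 64, hidden 768).
    import torch
    import torch.nn as nn

    hidden = 768
    final_layer_norm = nn.LayerNorm(hidden)
    head = nn.Linear(hidden, hidden)                 # stand-in for the text pooling head

    last_hidden_state = torch.randn(2, 64, hidden)   # encoder output [batch, seq, hidden]
    last_hidden_state = final_layer_norm(last_hidden_state)
    pooled_output = last_hidden_state[:, -1, :]      # last position, assumed to be EOS
    pooled_output = head(pooled_output)
    print(pooled_output.shape)                       # torch.Size([2, 768])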
@@ -843,13 +835,9 @@ class SiglipVisionTransformer(nn.Module):
         last_hidden_state = encoder_outputs[0]
         last_hidden_state = self.post_layernorm(last_hidden_state)
 
-        print("First values post layernorm:", last_hidden_state[0, :3, :3])
 
         pooled_output = self.head(last_hidden_state)
 
-        print("Shape of pooled vision output:", pooled_output.shape)
-        print("First values of pooled vision output:", pooled_output[0, :3])
-
         if not return_dict:
             return (last_hidden_state, pooled_output) + encoder_outputs[1:]
 
@@ -876,11 +864,6 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
         batch_size = hidden_state.shape[0]
         probe = self.probe.repeat(batch_size, 1, 1)
 
-        print("Shape of probe:", probe.shape)
-        print("First values of probe:", probe[0, :3, :3])
-        print("Shape of hidden state:", hidden_state.shape)
-        print("First values of hidden state:", hidden_state[0, :3, :3])
-
         hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
 
         residual = hidden_state
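The head called by SiglipVisionTransformer above is this multihead-attention pooling: a learned probe of shape [1, 1, hidden] is repeated across the batch and used as a single query attending over all patch tokens, so each image is pooled to one vector, and the residual = hidden_state line feeds the residual block that follows in the full class. A minimal sketch of the attention step built on torch.nn.MultiheadAttention with batch_first=True and illustrative sizes:

    import torch
    import torch.nn as nn

    hidden, heads, batch, num_patches = 768, 12, 2, 196    # illustrative sizes
    probe = nn.Parameter(torch.randn(1, 1, hidden))
    attention = nn.MultiheadAttention(hidden, heads, batch_first=True)

    hidden_state = torch.randn(batch, num_patches, hidden)  # patch tokens from the encoder
    queries = probe.repeat(batch, 1, 1)                     # [2, 1, 768], one probe per image
    pooled = attention(queries, hidden_state, hidden_state)[0]
    print(pooled.shape)                                     # torch.Size([2, 1, 768])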
@@ -1150,20 +1133,11 @@ class SiglipModel(SiglipPreTrainedModel):
         image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
         text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
 
-        print("Normalized image embeds:", image_embeds[0, :3])
-        print("Normalized text embeds:", text_embeds[0, :3])
-
         # cosine similarity as logits
         logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * self.temperature.exp() + self.bias
         logits_per_image = logits_per_text.t()
 
-        print("Learned temperature:", self.temperature)
-        print("Learned bias:", self.bias)
-
         z = torch.matmul(image_embeds, text_embeds.t()) * self.temperature.exp()
-        print("Multiplying by temperature:", z[:3, :3])
-
-        print("Logits per image:", logits_per_image[:3, :3])
 
         loss = None
         if return_loss:
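The logits in this last hunk are SigLIP's pairwise similarity scores: with both embedding sets L2-normalized, torch.matmul(text_embeds, image_embeds.t()) is a cosine-similarity matrix, scaled by the learned temperature (stored in log space, hence the .exp()) and shifted by the learned bias, and logits_per_image is simply its transpose. Note that z, which existed only to feed one of the removed prints, is still computed after this change. A minimal standalone sketch with random embeddings and made-up temperature/bias values:

    import torch

    torch.manual_seed(0)
    image_embeds = torch.randn(4, 768)
    text_embeds = torch.randn(4, 768)
    log_temperature = torch.tensor(4.6)   # illustrative value, not the checkpoint's
    bias = torch.tensor(-12.9)            # illustrative value, not the checkpoint's

    image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

    logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * log_temperature.exp() + bias
    logits_per_image = logits_per_text.t()

    # SigLIP trains with a pairwise sigmoid loss on these logits; the diagonal
    # entries correspond to the matching image-text pairs.
    probs = torch.sigmoid(logits_per_image)
    print(probs.shape)  # torch.Size([4, 4])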
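With the prints removed from the modeling code, the same intermediate values can still be inspected without editing the file by attaching PyTorch forward hooks from the outside. A minimal sketch, assuming a transformers version that ships SiglipModel and the public google/siglip-base-patch16-224 checkpoint; the submodule paths mirror the class names in this file but may differ in other revisions:

    import torch
    from transformers import SiglipModel

    model = SiglipModel.from_pretrained("google/siglip-base-patch16-224")

    def print_output(name):
        def hook(module, inputs, output):
            out = output[0] if isinstance(output, tuple) else output
            print(f"{name}: shape={tuple(out.shape)}, first values={out.flatten()[:3]}")
        return hook

    handles = [
        model.vision_model.embeddings.register_forward_hook(print_output("vision embeddings")),
        model.vision_model.head.register_forward_hook(print_output("pooled vision output")),
        model.text_model.head.register_forward_hook(print_output("pooled text output")),
    ]

    # ... run a forward pass, e.g. model(**inputs) with processor-prepared inputs ...

    for h in handles:
        h.remove()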