Added print statements in transformer file for debugging
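The diff below traces the forward pass with bare print() calls at each stage boundary (encoder start/end, two-stage proposal selection, the gather steps, decoder start/end). As a rough equivalent, the same trace points could go through Python's logging module so they can be switched off without editing the file again; the logger name, function, and setup in this sketch are illustrative assumptions and are not part of the commit.

import logging

# Hypothetical logger name; the actual change below uses plain print() calls.
logger = logging.getLogger("groundingdino.transformer")

def trace_stages():
    # Same stage-boundary messages as the print statements added in the diff,
    # emitted at DEBUG level so they can be silenced via the logging config.
    logger.debug("inside transformer forward")
    logger.debug("begin transformer encoder")
    logger.debug("got encoder output")
    logger.debug("going through decoder")
    logger.debug("got decoder output")

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG, format="%(name)s: %(message)s")
    trace_stages()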
models/GroundingDINO/transformer.py CHANGED
@@ -237,6 +237,7 @@ class Transformer(nn.Module):
 
         """
         # prepare input for encoder
+        print("inside transformer forward")
         src_flatten = []
         mask_flatten = []
         lvl_pos_embed_flatten = []
@@ -273,7 +274,7 @@ class Transformer(nn.Module):
         #########################################################
         # Begin Encoder
         #########################################################
-
+        print("begin transformer encoder")
         memory, memory_text = self.encoder(
             src_flatten,
             pos=lvl_pos_embed_flatten,
@@ -287,7 +288,7 @@ class Transformer(nn.Module):
             position_ids=text_dict["position_ids"],
             text_self_attention_masks=text_dict["text_self_attention_masks"],
         )
-
+        print("got encoder output")
         #########################################################
         # End Encoder
         # - memory: bs, \sum{hw}, c
@@ -302,9 +303,11 @@ class Transformer(nn.Module):
         # import ipdb; ipdb.set_trace()
 
         if self.two_stage_type == "standard":  # 把encoder的输出作为proposal
+            print("standard two stage")
             output_memory, output_proposals = gen_encoder_output_proposals(
                 memory, mask_flatten, spatial_shapes
             )
+            print("got output proposals")
             output_memory = self.enc_output_norm(self.enc_output(output_memory))
 
             if text_dict is not None:
@@ -321,24 +324,29 @@ class Transformer(nn.Module):
             topk = self.num_queries
 
             topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]  # bs, nq
-
+            print("got topk proposals")
             # gather boxes
+            print("gather 1")
             refpoint_embed_undetach = torch.gather(
                 enc_outputs_coord_unselected,
                 1,
                 topk_proposals.unsqueeze(-1).repeat(1, 1, 4),
             )  # unsigmoid
+            print("gathered 1")
             refpoint_embed_ = refpoint_embed_undetach.detach()
+            print("gather 2")
             init_box_proposal = torch.gather(
                 output_proposals, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
             ).sigmoid()  # sigmoid
+            print("gathered 2")
+            print("gather 3")
             # gather tgt
             tgt_undetach = torch.gather(
                 output_memory,
                 1,
                 topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model),
             )
+            print("gathered 3")
             if self.embed_init_tgt:
                 tgt_ = (
                     self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
@@ -393,6 +401,7 @@ class Transformer(nn.Module):
         # memory torch.Size([2, 16320, 256])
 
         # import pdb;pdb.set_trace()
+        print("going through decoder")
         hs, references = self.decoder(
             tgt=tgt.transpose(0, 1),
             memory=memory.transpose(0, 1),
@@ -407,6 +416,7 @@ class Transformer(nn.Module):
             text_attention_mask=~text_dict["text_token_mask"],
             # we ~ the mask . False means use the token; True means pad the token
         )
+        print("got decoder output")
         #########################################################
         # End Decoder
         # hs: n_dec, bs, nq, d_model
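For context, the section traced by the "gather 1"…"gather 3" prints selects the top-scoring encoder proposals and gathers their boxes and features. A self-contained sketch of that torch.topk/torch.gather pattern on dummy tensors follows; the shapes and random inputs are made up for illustration and are not taken from the model.

import torch

# Dummy sizes, loosely modeled on the comments in the diff (bs, \sum{hw}, c).
bs, num_tokens, d_model, num_queries = 2, 16320, 256, 900

topk_logits = torch.rand(bs, num_tokens)                      # per-token scores
enc_outputs_coord_unselected = torch.rand(bs, num_tokens, 4)  # unsigmoided boxes
output_memory = torch.rand(bs, num_tokens, d_model)           # encoder features

# Indices of the top-k proposals per batch element: bs, nq
topk_proposals = torch.topk(topk_logits, num_queries, dim=1)[1]

# Gather the boxes and features belonging to the selected proposals.
refpoint_embed_undetach = torch.gather(
    enc_outputs_coord_unselected, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
)
tgt_undetach = torch.gather(
    output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, d_model)
)

print(refpoint_embed_undetach.shape)  # torch.Size([2, 900, 4])
print(tgt_undetach.shape)             # torch.Size([2, 900, 256])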