Commit · a5b4e6b
Parent(s): 6bd742d
Upload 106 files
- app.py +2 -3
- demo_all_text_embedding_cache.pth +3 -0
- fcclip/fcclip.py +60 -21
app.py
CHANGED
@@ -27,8 +27,6 @@ from detectron2.data import MetadataCatalog
 from detectron2.projects.deeplab import add_deeplab_config
 
 
-coco_metadata = MetadataCatalog.get("coco_2017_val_panoptic")
-
 # import FCCLIP project
 from fcclip import add_maskformer2_config, add_fcclip_config
 from demo.predictor import DefaultPredictor, OpenVocabVisualizer
@@ -46,6 +44,7 @@ add_maskformer2_config(cfg)
 add_fcclip_config(cfg)
 cfg.merge_from_file("configs/coco/panoptic-segmentation/fcclip/fcclip_convnext_large_eval_ade20k.yaml")
 os.system("gdown 1-91PIns86vyNaL3CzMmDD39zKGnPMtvj")
+os.system("gdown 1-91PIns86vyNaL3CzMmDD39zKGnPMtvj")
 cfg.MODEL.WEIGHTS = './fcclip_cocopan.pth'
 cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = False
 cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False
@@ -160,7 +159,7 @@ def inference(image_path, vocab, label_list):
 
     im = cv2.imread(image_path)
    outputs = predictor(im)
-    v = OpenVocabVisualizer(im[:, :, ::-1], demo_metadata, scale=1.
+    v = OpenVocabVisualizer(im[:, :, ::-1], demo_metadata, scale=1.0, instance_mode=ColorMode.IMAGE)
    panoptic_result = v.draw_panoptic_seg(outputs["panoptic_seg"][0].to("cpu"), outputs["panoptic_seg"][1]).get_image()
    return Image.fromarray(np.uint8(panoptic_result)).convert('RGB')
 
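Note on the download step above: both os.system("gdown ...") calls shell out on every startup, even when the file is already present locally. Below is a minimal sketch of the same step done through gdown's Python API with an existence check; the assumption that this Drive ID resolves to the ./fcclip_cocopan.pth file expected by cfg.MODEL.WEIGHTS is ours, not the commit's.

    # Illustrative alternative to os.system("gdown <id>"): skip the download when
    # the target file already exists. Assumes `pip install gdown` and that the
    # Drive ID used in app.py resolves to ./fcclip_cocopan.pth (an assumption).
    import os
    import gdown

    def fetch_if_missing(drive_id: str, output: str) -> str:
        if not os.path.exists(output):
            gdown.download(f"https://drive.google.com/uc?id={drive_id}", output, quiet=False)
        return output

    fetch_if_missing("1-91PIns86vyNaL3CzMmDD39zKGnPMtvj", "./fcclip_cocopan.pth")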
demo_all_text_embedding_cache.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ee4c83884a03f41e1078a5b0916f6a26606258c0031e4e22e74c93c6672e9c9
+size 7848107
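demo_all_text_embedding_cache.pth is tracked through Git LFS, so the repository only stores the pointer above. According to fcclip/fcclip.py below, the payload is a plain dict mapping class-name strings (covering COCO, ADE20K and LVIS) to C-dimensional text embeddings. A minimal sketch of loading, inspecting and re-saving such a cache; the printed class name and the embedding width are illustrative assumptions, not values from the commit.

    # Minimal sketch of reading the cache tracked by this LFS pointer: a plain
    # dict of {class name: 1-D embedding tensor}, as described in fcclip/fcclip.py.
    import torch

    cache = torch.load("demo_all_text_embedding_cache.pth", map_location="cpu")
    print(len(cache), "cached class names")
    name, vec = next(iter(cache.items()))
    print(name, tuple(vec.shape))  # e.g. ('wall', (768,)); the width depends on the CLIP backbone

    # Re-saving mirrors the commented-out torch.save call in fcclip/fcclip.py
    torch.save({k: v.cpu() for k, v in cache.items()}, "demo_all_text_embedding_cache.pth")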
fcclip/fcclip.py
CHANGED
@@ -18,6 +18,7 @@ from .modeling.matcher import HungarianMatcher
 
 
 from .modeling.transformer_decoder.fcclip_transformer_decoder import MaskPooling, get_classification_logits
+import os
 VILD_PROMPT = [
     "a photo of a {}.",
     "This is a photo of a {}",
@@ -35,6 +36,20 @@ VILD_PROMPT = [
     "There is a large {} in the scene.",
 ]
 
+def split_labels(x):
+    res = []
+    for x_ in x:
+        x_ = x_.replace(', ', ',')
+        x_ = x_.split(',') # there can be multiple synonyms for single class
+        res.append(x_)
+    return res
+
+def fill_all_templates_ensemble(x_=''):
+    res = []
+    for x in x_:
+        for template in VILD_PROMPT:
+            res.append(template.format(x))
+    return res, len(res) // len(VILD_PROMPT)
 
 @META_ARCH_REGISTRY.register()
 class FCCLIP(nn.Module):
@@ -129,14 +144,15 @@ class FCCLIP(nn.Module):
         _, self.train_num_templates, self.train_class_names = self.prepare_class_names_from_metadata(train_metadata, train_metadata)
         self.category_overlapping_mask, self.test_num_templates, self.test_class_names = self.prepare_class_names_from_metadata(test_metadata, train_metadata)
 
+        self.demo_all_text_embedding_cache = {}
+        # This consists of COCO, ADE20K, LVIS
+        if os.path.exists("demo_all_text_embedding_cache.pth"):
+            # key: str of class name, value: tensor in shape of C
+            self.demo_all_text_embedding_cache = torch.load("demo_all_text_embedding_cache.pth", map_location=self.device)
+            self.demo_all_text_embedding_cache = {k:v.to(self.device) for k,v in self.demo_all_text_embedding_cache.items()}
+
+
     def prepare_class_names_from_metadata(self, metadata, train_metadata):
-        def split_labels(x):
-            res = []
-            for x_ in x:
-                x_ = x_.replace(', ', ',')
-                x_ = x_.split(',') # there can be multiple synonyms for single class
-                res.append(x_)
-            return res
         # get text classifier
         try:
             class_names = split_labels(metadata.stuff_classes) # it includes both thing and stuff
@@ -152,13 +168,6 @@ class FCCLIP(nn.Module):
             category_overlapping_list.append(is_overlapping)
         category_overlapping_mask = torch.tensor(
             category_overlapping_list, dtype=torch.long)
-
-        def fill_all_templates_ensemble(x_=''):
-            res = []
-            for x in x_:
-                for template in VILD_PROMPT:
-                    res.append(template.format(x))
-            return res, len(res) // len(VILD_PROMPT)
 
         num_templates = []
         templated_class_names = []
@@ -195,17 +204,47 @@ class FCCLIP(nn.Module):
             return self.train_text_classifier, self.train_num_templates
         else:
             if self.test_text_classifier is None:
+                try:
+                    nontemplated_class_names = split_labels(self.test_metadata.stuff_classes) # it includes both thing and stuff
+                except:
+                    # this could be for insseg, where only thing_classes are available
+                    nontemplated_class_names = split_labels(self.test_metadata.thing_classes)
+
+                text2classifier = {}
+                test_class_names = []
+                uncached_class_name = []
                 text_classifier = []
+                # exclude those already in cache
+                for class_names in nontemplated_class_names:
+                    for class_name in class_names:
+                        if class_name in self.demo_all_text_embedding_cache:
+                            text2classifier[class_name] = self.demo_all_text_embedding_cache[class_name].to(self.device)
+                        else:
+                            test_class_names += fill_all_templates_ensemble([class_name])[0]
+                            uncached_class_name.append(class_name)
+                print("Uncached texts:", len(uncached_class_name), uncached_class_name, test_class_names)
                 # this is needed to avoid oom, which may happen when num of class is large
                 bs = 128
-                for idx in range(0, len(self.test_class_names), bs):
-                    text_classifier.append(self.backbone.get_text_classifier(self.test_class_names[idx:idx+bs], self.device).detach())
-                text_classifier = torch.cat(text_classifier, dim=0)
+                for idx in range(0, len(test_class_names), bs):
+                    text_classifier.append(self.backbone.get_text_classifier(test_class_names[idx:idx+bs], self.device).detach())
+
+                if len(text_classifier) > 0:
+                    text_classifier = torch.cat(text_classifier, dim=0)
+                    # average across templates and normalization.
+                    text_classifier /= text_classifier.norm(dim=-1, keepdim=True)
+                    text_classifier = text_classifier.reshape(text_classifier.shape[0]//len(VILD_PROMPT), len(VILD_PROMPT), text_classifier.shape[-1]).mean(1)
+                    text_classifier /= text_classifier.norm(dim=-1, keepdim=True)
+                    assert text_classifier.shape[0] == len(uncached_class_name)
+                    for idx in range(len(uncached_class_name)):
+                        self.demo_all_text_embedding_cache[uncached_class_name[idx]] = text_classifier[idx]
+                        text2classifier[uncached_class_name[idx]] = text_classifier[idx]
+                    #torch.save({k:v for k, v in self.demo_all_text_embedding_cache.items()}, "demo_all_text_embedding_cache.pth")
 
-                # average across templates and normalization.
-                text_classifier /= text_classifier.norm(dim=-1, keepdim=True)
-                text_classifier = text_classifier.reshape(text_classifier.shape[0]//len(VILD_PROMPT), len(VILD_PROMPT), text_classifier.shape[-1]).mean(1)
-                text_classifier /= text_classifier.norm(dim=-1, keepdim=True)
+                text_classifier = []
+                for class_names in nontemplated_class_names:
+                    for text in class_names:
+                        text_classifier.append(text2classifier[text].to(self.device))
+                text_classifier = torch.stack(text_classifier, dim=0).to(self.device)
                 self.test_text_classifier = text_classifier
             return self.test_text_classifier, self.test_num_templates
 
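To make the new uncached-class path above easier to follow, here is a self-contained sketch of its shape bookkeeping: each uncached class name is expanded into len(VILD_PROMPT) prompts by fill_all_templates_ensemble, the per-prompt embeddings are normalized, averaged back down to one row per class, renormalized, and stored in the cache dict. The random tensor stands in for backbone.get_text_classifier output, C=768 is an assumed embedding width, and only a shortened prompt list is shown; none of these values come from the commit.

    import torch

    VILD_PROMPT = ["a photo of a {}.", "This is a photo of a {}"]  # shortened prompt list

    def fill_all_templates_ensemble(x_=''):
        # same helper as in the diff above: expand every class name with every template
        res = []
        for x in x_:
            for template in VILD_PROMPT:
                res.append(template.format(x))
        return res, len(res) // len(VILD_PROMPT)

    uncached_class_name = ["cat", "sofa"]
    test_class_names, _ = fill_all_templates_ensemble(uncached_class_name)
    assert len(test_class_names) == len(uncached_class_name) * len(VILD_PROMPT)

    C = 768  # assumed CLIP text-embedding width
    text_classifier = torch.randn(len(test_class_names), C)        # stand-in for get_text_classifier()
    text_classifier /= text_classifier.norm(dim=-1, keepdim=True)  # normalize per prompt
    text_classifier = text_classifier.reshape(-1, len(VILD_PROMPT), C).mean(1)  # average templates
    text_classifier /= text_classifier.norm(dim=-1, keepdim=True)  # renormalize per class
    cache = {name: vec for name, vec in zip(uncached_class_name, text_classifier)}
    assert all(v.shape == (C,) for v in cache.values())  # one C-dim vector per class name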