Commit 13c1c2e by ttengwang
1 Parent(s): eeb5fe8

Assign API key and image embedding from different users to different sessions

Files changed (4)
  1. app.py +141 -60
  2. caption_anything.py +21 -10
  3. segmenter/__init__.py +4 -2
  4. segmenter/base_segmenter.py +7 -4
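
Note: this commit drops the single global CaptionAnything instance. The captioner and the SAM weights are loaded once and shared, while the OpenAI text refiner (built from the user's API key) and the SAM image embedding are kept in per-session gr.State values that are passed into each callback. A minimal sketch of that Gradio per-session pattern, for orientation only; the set_key, set_image, and describe helpers below are hypothetical stand-ins for the repo's init_openai_api_key, upload_callback, and inference_seg_cap:

import gradio as gr

with gr.Blocks() as demo:
    # Each gr.State is created per browser session, so one user's API key or
    # image embedding is never visible to another user's requests.
    text_refiner = gr.State(None)     # per-session object built from that user's key
    image_embedding = gr.State(None)  # per-session features for the uploaded image

    api_key_box = gr.Textbox(label="OpenAI API Key", type="password")
    image_box = gr.Image(type="pil")
    caption_box = gr.Textbox(label="Caption")

    def set_key(key):
        # Hypothetical stand-in for build_text_refiner(...): return the per-session object.
        return {"api_key": key} if key else None

    def set_image(img):
        # Hypothetical stand-in for segmenter.set_image(...): return the per-session embedding.
        return ("embedding-for", getattr(img, "size", None))

    def describe(refiner, embedding):
        # Callbacks receive the session's own state values as ordinary arguments.
        return f"refiner ready: {refiner is not None}, embedding: {embedding}"

    api_key_box.submit(set_key, [api_key_box], [text_refiner])
    image_box.upload(set_image, [image_box], [image_embedding])
    image_box.select(describe, [text_refiner, image_embedding], [caption_box])

demo.launch()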
app.py CHANGED
@@ -15,6 +15,10 @@ import copy
 from tools import mask_painter
 from PIL import Image
 import os
+from captioner import build_captioner
+from segment_anything import sam_model_registry
+from text_refiner import build_text_refiner
+from segmenter import build_segmenter
 
 def download_checkpoint(url, folder, filename):
     os.makedirs(folder, exist_ok=True)
@@ -50,37 +54,74 @@ examples = [
 ]
 
 args = parse_augment()
-args.disable_reuse_features = True
 # args.device = 'cuda:5'
-# args.disable_gpt = False
-# args.enable_reduce_tokens = True
+# args.disable_gpt = True
+# args.enable_reduce_tokens = False
 # args.port=20322
-model = CaptionAnything(args)
+# args.captioner = 'blip'
+# args.regular_box = True
+shared_captioner = build_captioner(args.captioner, args.device, args)
+shared_sam_model = sam_model_registry['vit_h'](checkpoint=args.segmenter_checkpoint).to(args.device)
 
-def init_openai_api_key(api_key):
-    # os.environ['OPENAI_API_KEY'] = api_key
-    model.init_refiner(api_key)
-    openai_available = model.text_refiner is not None
-    return gr.update(visible = openai_available), gr.update(visible = openai_available), gr.update(visible = openai_available), gr.update(visible = True), gr.update(visible = True)
 
-def get_prompt(chat_input, click_state):
-    points = click_state[0]
-    labels = click_state[1]
+def build_caption_anything_with_models(args, api_key="", captioner=None, sam_model=None, text_refiner=None, session_id=None):
+    segmenter = build_segmenter(args.segmenter, args.device, args, model=sam_model)
+    captioner = captioner
+    if session_id is not None:
+        print('Init caption anything for session {}'.format(session_id))
+    return CaptionAnything(args, api_key, captioner=captioner, segmenter=segmenter, text_refiner=text_refiner)
+
+
+def init_openai_api_key(api_key=""):
+    text_refiner = None
+    if api_key and len(api_key) > 30:
+        try:
+            text_refiner = build_text_refiner(args.text_refiner, args.device, args, api_key)
+            text_refiner.llm('hi') # test
+        except:
+            text_refiner = None
+    openai_available = text_refiner is not None
+    return gr.update(visible = openai_available), gr.update(visible = openai_available), gr.update(visible = openai_available), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), text_refiner
+
+
+def get_prompt(chat_input, click_state, click_mode):
     inputs = json.loads(chat_input)
-    for input in inputs:
-        points.append(input[:2])
-        labels.append(input[2])
+    if click_mode == 'Continuous':
+        points = click_state[0]
+        labels = click_state[1]
+        for input in inputs:
+            points.append(input[:2])
+            labels.append(input[2])
+    elif click_mode == 'Single':
+        points = []
+        labels = []
+        for input in inputs:
+            points.append(input[:2])
+            labels.append(input[2])
+        click_state[0] = points
+        click_state[1] = labels
+    else:
+        raise NotImplementedError
 
     prompt = {
         "prompt_type":["click"],
-        "input_point":points,
-        "input_label":labels,
+        "input_point":click_state[0],
+        "input_label":click_state[1],
        "multimask_output":"True",
     }
     return prompt
 
-def chat_with_points(chat_input, click_state, state):
-    if model.text_refiner is None:
+def update_click_state(click_state, caption, click_mode):
+    if click_mode == 'Continuous':
+        click_state[2].append(caption)
+    elif click_mode == 'Single':
+        click_state[2] = [caption]
+    else:
+        raise NotImplementedError
+
+
+def chat_with_points(chat_input, click_state, state, text_refiner):
+    if text_refiner is None:
         response = "Text refiner is not initilzed, please input openai api key."
         state = state + [(chat_input, response)]
         return state, state
@@ -96,11 +137,26 @@ def chat_with_points(chat_input, click_state, state):
     else:
         prev_visual_context = 'no point exists.'
     chat_prompt = point_chat_prompt.format(**{"points_with_caps": prev_visual_context, "chat_input": chat_input})
-    response = model.text_refiner.llm(chat_prompt)
+    response = text_refiner.llm(chat_prompt)
     state = state + [(chat_input, response)]
     return state, state
 
-def inference_seg_cap(image_input, point_prompt, language, sentiment, factuality, length, state, click_state, evt:gr.SelectData):
+def inference_seg_cap(image_input, point_prompt, click_mode, language, sentiment, factuality,
+                      length, image_embedding, state, click_state, original_size, input_size, text_refiner, evt:gr.SelectData):
+
+    model = build_caption_anything_with_models(
+        args,
+        api_key="",
+        captioner=shared_captioner,
+        sam_model=shared_sam_model,
+        text_refiner=text_refiner,
+        session_id=iface.app_id
+    )
+
+    model.segmenter.image_embedding = image_embedding
+    model.segmenter.predictor.original_size = original_size
+    model.segmenter.predictor.input_size = input_size
+    model.segmenter.predictor.is_image_set = True
 
     if point_prompt == 'Positive':
         coordinate = "[[{}, {}, 1]]".format(str(evt.index[0]), str(evt.index[1]))
@@ -114,7 +170,7 @@ def inference_seg_cap(image_input, point_prompt, language, sentiment, factuality
 
     # click_coordinate = "[[{}, {}, 1]]".format(str(evt.index[0]), str(evt.index[1]))
     # chat_input = click_coordinate
-    prompt = get_prompt(coordinate, click_state)
+    prompt = get_prompt(coordinate, click_state, click_mode)
     print('prompt: ', prompt, 'controls: ', controls)
 
     out = model.inference(image_input, prompt, controls)
@@ -123,12 +179,12 @@ def inference_seg_cap(image_input, point_prompt, language, sentiment, factuality
     # state = state + [(f'{k}: {v}', None)]
     state = state + [("caption: {}".format(out['generated_captions']['raw_caption']), None)]
     wiki = out['generated_captions'].get('wiki', "")
-    click_state[2].append(out['generated_captions']['raw_caption'])
-
+
+    update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
     text = out['generated_captions']['raw_caption']
     # draw = ImageDraw.Draw(image_input)
     # draw.text((evt.index[0], evt.index[1]), text, textcolor=(0,0,255), text_size=120)
-    input_mask = np.array(Image.open(out['mask_save_path']).convert('P'))
+    input_mask = np.array(out['mask'].convert('P'))
     image_input = mask_painter(np.array(image_input), input_mask)
     origin_image_input = image_input
     image_input = create_bubble_frame(image_input, text, (evt.index[0], evt.index[1]))
@@ -151,10 +207,19 @@ def upload_callback(image_input, state):
     if ratio < 1.0:
         image_input = image_input.resize((int(width * ratio), int(height * ratio)))
         print('Scaling input image to {}'.format(image_input.size))
-    model.segmenter.image = None
-    model.segmenter.image_embedding = None
+
+    model = build_caption_anything_with_models(
+        args,
+        api_key="",
+        captioner=shared_captioner,
+        sam_model=shared_sam_model,
+        session_id=iface.app_id
+    )
     model.segmenter.set_image(image_input)
-    return state, image_input, click_state, image_input
+    image_embedding = model.segmenter.image_embedding
+    original_size = model.segmenter.predictor.original_size
+    input_size = model.segmenter.predictor.input_size
+    return state, state, image_input, click_state, image_input, image_embedding, original_size, input_size
 
 with gr.Blocks(
     css='''
@@ -165,6 +230,10 @@ with gr.Blocks(
     state = gr.State([])
     click_state = gr.State([[],[],[]])
     origin_image = gr.State(None)
+    image_embedding = gr.State(None)
+    text_refiner = gr.State(None)
+    original_size = gr.State(None)
+    input_size = gr.State(None)
 
     gr.Markdown(title)
     gr.Markdown(description)
@@ -175,17 +244,24 @@ with gr.Blocks(
             image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
             example_image = gr.Image(type="pil", interactive=False, visible=False)
             with gr.Row(scale=1.0):
-                point_prompt = gr.Radio(
-                    choices=["Positive", "Negative"],
-                    value="Positive",
-                    label="Point Prompt",
-                    interactive=True)
-                clear_button_clike = gr.Button(value="Clear Clicks", interactive=True)
-                clear_button_image = gr.Button(value="Clear Image", interactive=True)
+                with gr.Row(scale=0.4):
+                    point_prompt = gr.Radio(
+                        choices=["Positive", "Negative"],
+                        value="Positive",
+                        label="Point Prompt",
+                        interactive=True)
+                    click_mode = gr.Radio(
+                        choices=["Continuous", "Single"],
+                        value="Continuous",
+                        label="Clicking Mode",
+                        interactive=True)
+                with gr.Row(scale=0.4):
+                    clear_button_clike = gr.Button(value="Clear Clicks", interactive=True)
+                    clear_button_image = gr.Button(value="Clear Image", interactive=True)
             with gr.Column(visible=False) as modules_need_gpt:
                with gr.Row(scale=1.0):
                     language = gr.Dropdown(['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"], value="English", label="Language", interactive=True)
-
+
                     sentiment = gr.Radio(
                         choices=["Positive", "Natural", "Negative"],
                         value="Natural",
@@ -206,27 +282,36 @@ with gr.Blocks(
                         step=1,
                         interactive=True,
                         label="Length",
-                    )
-
+                    )
+            with gr.Column(visible=True) as modules_not_need_gpt3:
+                gr.Examples(
+                    examples=examples,
+                    inputs=[example_image],
+                )
        with gr.Column(scale=0.5):
            openai_api_key = gr.Textbox(
-                placeholder="Input openAI API key and press Enter (Input blank will disable GPT)",
+                placeholder="Input openAI API key",
                show_label=False,
                label = "OpenAI API Key",
                lines=1,
-                type="password"
-            )
+                type="password")
+            with gr.Row(scale=0.5):
+                enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary')
+                disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True, variant='primary')
            with gr.Column(visible=False) as modules_need_gpt2:
-                wiki_output = gr.Textbox(lines=6, label="Wiki")
+                wiki_output = gr.Textbox(lines=5, label="Wiki", max_lines=5)
            with gr.Column(visible=False) as modules_not_need_gpt2:
-                chatbot = gr.Chatbot(label="Chat about Selected Object",).style(height=450,scale=0.5)
+                chatbot = gr.Chatbot(label="Chat about Selected Object",).style(height=550,scale=0.5)
            with gr.Column(visible=False) as modules_need_gpt3:
                chat_input = gr.Textbox(lines=1, label="Chat Input")
                with gr.Row():
                    clear_button_text = gr.Button(value="Clear Text", interactive=True)
                    submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")
-
-    openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key], outputs=[modules_need_gpt,modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt, modules_not_need_gpt2])
+
+    openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key], outputs=[modules_need_gpt,modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt, modules_not_need_gpt2, modules_not_need_gpt3, text_refiner])
+    enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key], outputs=[modules_need_gpt,modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt, modules_not_need_gpt2, modules_not_need_gpt3, text_refiner])
+    disable_chatGPT_button.click(init_openai_api_key, outputs=[modules_need_gpt,modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt, modules_not_need_gpt2, modules_not_need_gpt3, text_refiner])
+
    clear_button_clike.click(
        lambda x: ([[], [], []], x, ""),
        [origin_image],
@@ -256,33 +341,29 @@ with gr.Blocks(
        show_progress=False
    )
 
-    def example_callback(x):
-        model.image_embedding = None
-        return x
-
-    gr.Examples(
-        examples=examples,
-        inputs=[example_image],
-    )
-
-    image_input.upload(upload_callback,[image_input, state], [state, origin_image, click_state, image_input])
-    chat_input.submit(chat_with_points, [chat_input, click_state, state], [chatbot, state])
-    example_image.change(upload_callback,[example_image, state], [state, origin_image, click_state, image_input])
+    image_input.upload(upload_callback,[image_input, state], [chatbot, state, origin_image, click_state, image_input, image_embedding, original_size, input_size])
+    chat_input.submit(chat_with_points, [chat_input, click_state, state, text_refiner], [chatbot, state])
+    example_image.change(upload_callback,[example_image, state], [state, state, origin_image, click_state, image_input, image_embedding, original_size, input_size])
 
    # select coordinate
    image_input.select(inference_seg_cap,
                       inputs=[
                           origin_image,
                           point_prompt,
+                           click_mode,
                           language,
                           sentiment,
                           factuality,
                           length,
+                           image_embedding,
                           state,
-                           click_state
+                           click_state,
+                           original_size,
+                           input_size,
+                           text_refiner
                       ],
                       outputs=[chatbot, state, click_state, chat_input, image_input, wiki_output],
                       show_progress=False, queue=True)
 
-    iface.queue(concurrency_count=1, api_open=False)
-    iface.launch(server_name="0.0.0.0", enable_queue=True)
+    iface.queue(concurrency_count=1, api_open=False, max_size=10)
+    iface.launch(server_name="0.0.0.0", enable_queue=True, server_port=args.port, share=args.gradio_share)
caption_anything.py CHANGED
@@ -6,14 +6,17 @@ import argparse
 import pdb
 import time
 from PIL import Image
+import cv2
+import numpy as np
 
 class CaptionAnything():
-    def __init__(self, args, api_key=""):
+    def __init__(self, args, api_key="", captioner=None, segmenter=None, text_refiner=None):
         self.args = args
-        self.captioner = build_captioner(args.captioner, args.device, args)
-        self.segmenter = build_segmenter(args.segmenter, args.device, args)
+        self.captioner = build_captioner(args.captioner, args.device, args) if captioner is None else captioner
+        self.segmenter = build_segmenter(args.segmenter, args.device, args) if segmenter is None else segmenter
+
         self.text_refiner = None
-        if not args.disable_gpt:
+        if not args.disable_gpt and text_refiner is not None:
             self.init_refiner(api_key)
 
     def init_refiner(self, api_key):
@@ -22,19 +25,25 @@ class CaptionAnything():
             self.text_refiner.llm('hi') # test
         except:
             self.text_refiner = None
-            print('Openai api key is NOT given')
+            print('OpenAI GPT is not available')
 
     def inference(self, image, prompt, controls, disable_gpt=False):
         # segment with prompt
         print("CA prompt: ", prompt, "CA controls",controls)
         seg_mask = self.segmenter.inference(image, prompt)[0, ...]
+        if self.args.enable_morphologyex:
+            seg_mask = 255 * seg_mask.astype(np.uint8)
+            seg_mask = np.stack([seg_mask, seg_mask, seg_mask], axis = -1)
+            seg_mask = cv2.morphologyEx(seg_mask, cv2.MORPH_OPEN, kernel = np.ones((6, 6), np.uint8))
+            seg_mask = cv2.morphologyEx(seg_mask, cv2.MORPH_CLOSE, kernel = np.ones((6, 6), np.uint8))
+            seg_mask = seg_mask[:,:,0] > 0
         mask_save_path = f'result/mask_{time.time()}.png'
         if not os.path.exists(os.path.dirname(mask_save_path)):
             os.makedirs(os.path.dirname(mask_save_path))
-        new_p = Image.fromarray(seg_mask.astype('int') * 255.)
-        if new_p.mode != 'RGB':
-            new_p = new_p.convert('RGB')
-        new_p.save(mask_save_path)
+        seg_mask_img = Image.fromarray(seg_mask.astype('int') * 255.)
+        if seg_mask_img.mode != 'RGB':
+            seg_mask_img = seg_mask_img.convert('RGB')
+        seg_mask_img.save(mask_save_path)
         print('seg_mask path: ', mask_save_path)
         print("seg_mask.shape: ", seg_mask.shape)
         # captioning with mask
@@ -53,6 +62,7 @@ class CaptionAnything():
         out = {'generated_captions': refined_caption,
                'crop_save_path': crop_save_path,
                'mask_save_path': mask_save_path,
+               'mask': seg_mask_img,
                'context_captions': context_captions}
         return out
 
@@ -73,6 +83,7 @@ def parse_augment():
     parser.add_argument('--disable_gpt', action="store_true")
     parser.add_argument('--enable_reduce_tokens', action="store_true", default=False)
     parser.add_argument('--disable_reuse_features', action="store_true", default=False)
+    parser.add_argument('--enable_morphologyex', action="store_true", default=False)
     args = parser.parse_args()
 
     if args.debug:
@@ -115,4 +126,4 @@ if __name__ == "__main__":
     print('Language controls:\n', controls)
     out = model.inference(image_path, prompt, controls)
 
-
+
segmenter/__init__.py CHANGED
@@ -1,6 +1,8 @@
 from segmenter.base_segmenter import BaseSegmenter
 
 
-def build_segmenter(type, device, args=None):
+def build_segmenter(type, device, args=None, model=None):
     if type == 'base':
-        return BaseSegmenter(device, args.segmenter_checkpoint, reuse_feature=not args.disable_reuse_features)
+        return BaseSegmenter(device, args.segmenter_checkpoint, reuse_feature=not args.disable_reuse_features, model=model)
+    else:
+        raise NotImplementedError()
segmenter/base_segmenter.py CHANGED
@@ -9,15 +9,18 @@ import matplotlib.pyplot as plt
 import PIL
 
 class BaseSegmenter:
-    def __init__(self, device, checkpoint, model_type='vit_h', reuse_feature = True):
+    def __init__(self, device, checkpoint, model_type='vit_h', reuse_feature = True, model=None):
         print(f"Initializing BaseSegmenter to {device}")
         self.device = device
         self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
         self.processor = None
         self.model_type = model_type
-        self.checkpoint = checkpoint
-        self.model = sam_model_registry[self.model_type](checkpoint=self.checkpoint)
-        self.model.to(device=self.device)
+        if model is None:
+            self.checkpoint = checkpoint
+            self.model = sam_model_registry[self.model_type](checkpoint=self.checkpoint)
+            self.model.to(device=self.device)
+        else:
+            self.model = model
         self.reuse_feature = reuse_feature
         self.predictor = SamPredictor(self.model)
         self.mask_generator = SamAutomaticMaskGenerator(self.model)
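
For reference, a usage sketch of the new sharing path: one SAM backbone loaded once and handed to several BaseSegmenter instances, each keeping its own SamPredictor and cached features. The Args stub and checkpoint path are hypothetical; app.py does the equivalent via parse_augment() and shared_sam_model.

from segment_anything import sam_model_registry
from segmenter import build_segmenter

class Args:  # minimal stand-in for the namespace returned by parse_augment()
    segmenter = 'base'
    segmenter_checkpoint = './checkpoints/sam_vit_h_4b8939.pth'  # assumed local path
    disable_reuse_features = False

args = Args()
device = 'cuda:0'

# Load the SAM weights once and share them across per-session segmenters.
shared_sam = sam_model_registry['vit_h'](checkpoint=args.segmenter_checkpoint).to(device)

seg_a = build_segmenter(args.segmenter, device, args, model=shared_sam)  # session A
seg_b = build_segmenter(args.segmenter, device, args, model=shared_sam)  # session B
assert seg_a.model is seg_b.model               # same backbone weights
assert seg_a.predictor is not seg_b.predictor   # separate per-session predictor state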