justinj92 committed on
Commit 51a75e5 · verified · 1 Parent(s): c6a0eef

Update app.py

Files changed (1): app.py (+170, -170)
app.py CHANGED
@@ -1,171 +1,171 @@
- from typing import Tuple, Optional
-
- import gradio as gr
- import spaces
- import supervision as sv
- import torch
- from PIL import Image
- from gradio_image_prompter import ImagePrompter
-
- from utils.annotate import annotate_with_boxes
- from utils.models import load_models, run_inference, CHECKPOINTS, \
-     pre_process_region_task_input, post_process_region_output
- from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
-     CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
-     MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
-     IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
-     TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES, \
-     IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME, \
-     DENSE_REGION_CAPTION_TASK_NAME
-
- MARKDOWN = """
- # Florence-2 🔥
-
- Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
- MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
- across tasks such as captioning, object detection, grounding, and segmentation.
- The model takes images and task prompts as input, generating the desired results in
- text format. It uses a DaViT vision encoder to convert images into visual token
- embeddings. These are then concatenated with BERT-generated text embeddings and
- processed by a transformer-based multi-modal encoder-decoder to generate the response.
- """
- EXAMPLES = [
-     ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
-     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
- ]
-
- # DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- DEVICE = "cuda"
- MODELS, PROCESSORS = load_models(DEVICE)
-
-
- @spaces.GPU
- def process(
-     checkpoint_dropdown,
-     task_dropdown,
-     image_input,
-     image_prompter_input
- ) -> Tuple[Optional[Image.Image], Optional[str]]:
-     model = MODELS[checkpoint_dropdown]
-     processor = PROCESSORS[checkpoint_dropdown]
-     task = TASKS[task_dropdown]
-
-     if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
-         _, response = run_inference(
-             model, processor, DEVICE, image_input, task)
-         detections = sv.Detections.from_lmm(
-             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
-         return annotate_with_boxes(image_input, detections), None
-
-     elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
-         _, response = run_inference(
-             model, processor, DEVICE, image_input, task)
-         return None, response[task]
-
-     elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
-         detections_list = []
-
-         print(image_prompter_input)
-
-         image_input = image_prompter_input["image"]
-         for prompt in image_prompter_input["points"]:
-             text = pre_process_region_task_input(
-                 prompt=prompt,
-                 resolution_wh=image_input.size
-             )
-             _, response = run_inference(
-                 model, processor, DEVICE, image_input, task, text)
-             detections = sv.Detections.from_lmm(
-                 lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
-             detections_list.append(detections)
-         detections = sv.Detections.merge(detections_list=detections_list)
-         detections = post_process_region_output(
-             detections=detections, resolution_wh=image_input.size)
-
-         return annotate_with_boxes(image_input, detections), None
-
-
- with gr.Blocks() as demo:
-     gr.Markdown(MARKDOWN)
-     with gr.Row():
-         checkpoint_dropdown_component = gr.Dropdown(
-             choices=CHECKPOINTS,
-             value=CHECKPOINTS[0],
-             label="Model", info="Select a Florence 2 model to use.",
-             interactive=True
-         )
-         task_dropdown_component = gr.Dropdown(
-             choices=TASK_NAMES,
-             value=TASK_NAMES[0],
-             label="Task", info="Select a task to perform with the model.",
-             interactive=True
-         )
-
-     with gr.Row():
-         with gr.Column():
-             image_input_component = gr.Image(
-                 type='pil', label='Upload image')
-             image_prompter_input_component = ImagePrompter(
-                 type='pil', label='Image prompt', visible=False)
-             submit_button_component = gr.Button(value='Submit', variant='primary')
-
-         with gr.Column():
-             image_output_component = gr.Image(type='pil', label='Image Output')
-             text_output_component = gr.Textbox(label='Caption Output', visible=False)
-     with gr.Row():
-         gr.Examples(
-             fn=process,
-             examples=EXAMPLES,
-             inputs=[
-                 checkpoint_dropdown_component,
-                 task_dropdown_component,
-                 image_input_component,
-                 image_prompter_input_component
-             ],
-             outputs=[
-                 image_output_component,
-                 text_output_component
-             ],
-             run_on_click=True
-         )
-
-     def on_dropdown_change(text):
-         return [
-             gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
-             ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
-             gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
-             gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
-         ]
-
-     task_dropdown_component.change(
-         on_dropdown_change,
-         inputs=[task_dropdown_component],
-         outputs=[
-             image_input_component,
-             image_prompter_input_component,
-             image_output_component,
-             text_output_component
-         ]
-     )
-     submit_button_component.click(
-         fn=process,
-         inputs=[
-             checkpoint_dropdown_component,
-             task_dropdown_component,
-             image_input_component,
-             image_prompter_input_component
-         ],
-         outputs=[
-             image_output_component,
-             text_output_component
-         ]
-     )
-
+ from typing import Tuple, Optional
+
+ import gradio as gr
+ import spaces
+ import supervision as sv
+ import torch
+ from PIL import Image
+ from gradio_image_prompter import ImagePrompter
+
+ from utils.annotate import annotate_with_boxes
+ from utils.models import load_models, run_inference, CHECKPOINTS, \
+     pre_process_region_task_input, post_process_region_output
+ from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
+     CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
+     MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
+     IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
+     TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES, \
+     IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME, \
+     DENSE_REGION_CAPTION_TASK_NAME
+
+ MARKDOWN = """
+ # Florence-2 🔥
+
+ Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
+ MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
+ across tasks such as captioning, object detection, grounding, and segmentation.
+ The model takes images and task prompts as input, generating the desired results in
+ text format. It uses a DaViT vision encoder to convert images into visual token
+ embeddings. These are then concatenated with BERT-generated text embeddings and
+ processed by a transformer-based multi-modal encoder-decoder to generate the response.
+ """
+ EXAMPLES = [
+     ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
+     ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+ ]
+
+ # DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ DEVICE = "cuda"
+ MODELS, PROCESSORS = load_models(DEVICE)
+
+
+ @spaces.GPU
+ def process(
+     checkpoint_dropdown,
+     task_dropdown,
+     image_input,
+     image_prompter_input
+ ) -> Tuple[Optional[Image.Image], Optional[str]]:
+     model = MODELS[checkpoint_dropdown]
+     processor = PROCESSORS[checkpoint_dropdown]
+     task = TASKS[task_dropdown]
+
+     if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
+         _, response = run_inference(
+             model, processor, DEVICE, image_input, task)
+         detections = sv.Detections.from_lmm(
+             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+         return annotate_with_boxes(image_input, detections), None
+
+     elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
+         _, response = run_inference(
+             model, processor, DEVICE, image_input, task)
+         return None, response[task]
+
+     elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
+         detections_list = []
+
+         print(image_prompter_input)
+
+         image_input = image_prompter_input["image"]
+         for prompt in image_prompter_input["points"]:
+             text = pre_process_region_task_input(
+                 prompt=prompt,
+                 resolution_wh=image_input.size
+             )
+             _, response = run_inference(
+                 model, processor, DEVICE, image_input, task, text)
+             detections = sv.Detections.from_lmm(
+                 lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+             detections_list.append(detections)
+         detections = sv.Detections.merge(detections_list=detections_list)
+         detections = post_process_region_output(
+             detections=detections, resolution_wh=image_input.size)
+
+         return annotate_with_boxes(image_input, detections), None
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(MARKDOWN)
+     with gr.Row():
+         checkpoint_dropdown_component = gr.Dropdown(
+             choices=CHECKPOINTS,
+             value=CHECKPOINTS[0],
+             label="Model", info="Select a Florence 2 model to use.",
+             interactive=True
+         )
+         task_dropdown_component = gr.Dropdown(
+             choices=TASK_NAMES,
+             value=TASK_NAMES[0],
+             label="Task", info="Select a task to perform with the model.",
+             interactive=True
+         )
+
+     with gr.Row():
+         with gr.Column():
+             image_input_component = gr.Image(
+                 type='pil', label='Upload image')
+             image_prompter_input_component = ImagePrompter(
+                 type='pil', label='Image prompt', visible=False)
+             submit_button_component = gr.Button(value='Submit', variant='primary')
+
+         with gr.Column():
+             image_output_component = gr.Image(type='pil', label='Image Output')
+             text_output_component = gr.Textbox(label='Caption Output', visible=False)
+     with gr.Row():
+         gr.Examples(
+             fn=process,
+             examples=EXAMPLES,
+             inputs=[
+                 checkpoint_dropdown_component,
+                 task_dropdown_component,
+                 image_input_component,
+                 image_prompter_input_component
+             ],
+             outputs=[
+                 image_output_component,
+                 text_output_component
+             ],
+             run_on_click=True
+         )
+
+     def on_dropdown_change(text):
+         return [
+             gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
+             ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
+             gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
+             gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
+         ]
+
+     task_dropdown_component.change(
+         on_dropdown_change,
+         inputs=[task_dropdown_component],
+         outputs=[
+             image_input_component,
+             image_prompter_input_component,
+             image_output_component,
+             text_output_component
+         ]
+     )
+     submit_button_component.click(
+         fn=process,
+         inputs=[
+             checkpoint_dropdown_component,
+             task_dropdown_component,
+             image_input_component,
+             image_prompter_input_component
+         ],
+         outputs=[
+             image_output_component,
+             text_output_component
+         ]
+     )
+
  demo.launch(debug=False, show_error=True, max_threads=1)
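The visible content change in this commit is the first EXAMPLES entry: the object-detection example now points at the Roboflow dog-2.jpeg image instead of the Temple Bar photo. For context, below is a minimal, self-contained sketch of the kind of Florence-2 call the Space's run_inference helper presumably wraps. It follows the standard transformers usage from the public microsoft/Florence-2-large-ft model card; the exact signatures of the utils/ helpers are not shown in this diff and may differ.

# Hedged sketch, not part of this commit: standard Florence-2 inference on the
# new example image, assuming the usual transformers remote-code API.
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
import supervision as sv

CHECKPOINT = "microsoft/Florence-2-large-ft"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT, trust_remote_code=True).to(DEVICE)
processor = AutoProcessor.from_pretrained(CHECKPOINT, trust_remote_code=True)

# The object-detection example image introduced by this commit.
url = "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

task = "<OD>"  # object-detection task token
inputs = processor(text=task, images=image, return_tensors="pt").to(DEVICE)
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    num_beams=3
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
response = processor.post_process_generation(
    generated_text, task=task, image_size=(image.width, image.height))
print(response)  # e.g. {'<OD>': {'bboxes': [...], 'labels': [...]}}

# As in the app, the parsed response can be converted into supervision
# Detections for box annotation.
detections = sv.Detections.from_lmm(
    lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image.size)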