onuralpszr committed on
Commit
49d986a
•
1 Parent(s): 990acce

docs: πŸ“ better description for intro text

Browse files

Signed-off-by: Onuralp SEZER <thunderbirdtr@gmail.com>

Files changed (2) hide show
  1. app.py +43 -14
  2. requirements.txt +0 -1
app.py CHANGED
@@ -1,6 +1,3 @@
1
- import os
2
- import PIL.Image
3
- import transformers
4
  from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
5
  import torch
6
  import supervision as sv
@@ -20,6 +17,7 @@ model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(DE
20
  processor = PaliGemmaProcessor.from_pretrained(model_id)
21
 
22
 
 
23
  @spaces.GPU
24
  def process_image(input_image,input_text,class_names):
25
  class_list = class_names.split(',')
@@ -57,20 +55,51 @@ def process_image(input_image,input_text,class_names):
57
 
58
  return annotated_image, result
59
 
 
 
 
60
 
61
- app = gr.Interface(
62
- fn=process_image,
63
- inputs=[
64
- gr.Image(type="pil", label="Input Image"),
65
- gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog"),
66
- gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
67
- ],
68
- outputs=[gr.Image(type="pil", label="Annotated Image"), gr.Textbox(label="Detection Result")],
69
- title="PaliGemma2 Image Detection with Supervision",
70
- description="Detect objects in an image using PaliGemma2 model."
71
- )
 
 
 
 
 
 
72
 
 
 
 
 
 
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  if __name__ == "__main__":
76
  app.launch()
 
 
 
 
1
  from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
2
  import torch
3
  import supervision as sv
 
17
  processor = PaliGemmaProcessor.from_pretrained(model_id)
18
 
19
 
20
+
21
  @spaces.GPU
22
  def process_image(input_image,input_text,class_names):
23
  class_list = class_names.split(',')
 
55
 
56
  return annotated_image, result
57
 
58
+ with gr.Blocks() as app:
59
+ gr.Markdown( """
60
+ ## PaliGemma 2 Detection with Supervision - Demo \n\n
61
 
62
+ <div style="display: flex; gap: 10px;">
63
+ <a href="https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md">
64
+ <img src="https://img.shields.io/badge/Github-100000?style=flat&logo=github&logoColor=white" alt="Github">
65
+ </a>
66
+ <a href="https://huggingface.co/blog/paligemma">
67
+ <img src="https://img.shields.io/badge/Huggingface-FFD21E?style=flat&logo=Huggingface&logoColor=black" alt="Huggingface">
68
+ </a>
69
+ <a href="https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb">
70
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab">
71
+ </a>
72
+ <a href="https://arxiv.org/abs/2412.03555">
73
+ <img src="https://img.shields.io/badge/Arvix-B31B1B?style=flat&logo=arXiv&logoColor=white" alt="Paper">
74
+ </a>
75
+ <a href="https://supervision.roboflow.com/">
76
+ <img src="https://img.shields.io/badge/Supervision-6706CE?style=flat&logo=Roboflow&logoColor=white" alt="Supervision">
77
+ </a>
78
+ </div>
79
 
80
+ \n\n
81
+ PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
82
+ built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
83
+ vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
84
+ model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
85
+ answering, text reading, object detection and object segmentation.
86
 
87
+ This space show how to use PaliGemma 2 for object detection with supervision.
88
+ You can input an image and a text prompt
89
+ """)
90
+ with gr.Row():
91
+ with gr.Column():
92
+ input_image = gr.Image(type="pil", label="Input Image")
93
+ input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
94
+ class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
95
+ with gr.Column():
96
+ annotated_image = gr.Image(type="pil", label="Annotated Image")
97
+ detection_result = gr.Textbox(label="Detection Result")
98
+ gr.Button("Submit").click(
99
+ fn=process_image,
100
+ inputs=[input_image, input_text, class_names],
101
+ outputs=[annotated_image, detection_result]
102
+ )
103
 
104
  if __name__ == "__main__":
105
  app.launch()
requirements.txt CHANGED
@@ -2,6 +2,5 @@ supervision
2
  transformers==4.47.0
3
  requests
4
  tqdm
5
- gradio
6
  spaces
7
  torch
 
2
  transformers==4.47.0
3
  requests
4
  tqdm
 
5
  spaces
6
  torch