Spaces:

onuralpszr
/

paligemma2-detection

Running on Zero

App Files Files Community

onuralpszr committed 10 days ago

Commit

50942e0

•

1 Parent(s): 1be4b11

feat: ✨ token slider added

Browse files

Signed-off-by: Onuralp SEZER <thunderbirdtr@gmail.com>

Files changed (1) hide show

app.py +21 -39

app.py CHANGED Viewed

@@ -21,14 +21,14 @@ model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(DE
 processor = PaliGemmaProcessor.from_pretrained(model_id)
 @spaces.GPU
-def paligemma_detection(input_image, input_text):
     model_inputs = processor(text=input_text,
                              images=input_image,
                              return_tensors="pt"
                              ).to(torch.bfloat16).to(model.device)
     input_len = model_inputs["input_ids"].shape[-1]
     with torch.inference_mode():
-        generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
         generation = generation[0][input_len:]
         result = processor.decode(generation, skip_special_tokens=True)
     return result
@@ -61,9 +61,9 @@ def annotate_image(result, resolution_wh, class_names, cv_image):
     return annotated_image
-def process_image(input_image,input_text,class_names):
     cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
-    result = paligemma_detection(input_image, input_text)
     annotated_image = annotate_image(result,
                                      (input_image.width, input_image.height),
                                      class_names, cv_image)
@@ -71,7 +71,7 @@ def process_image(input_image,input_text,class_names):
 @spaces.GPU
-def process_video(input_video, input_text, class_names, progress=gr.Progress(track_tqdm=True)):
     if not input_video:
         gr.Info("Please upload a video.")
         return None
@@ -100,7 +100,7 @@ def process_video(input_video, input_text, class_names, progress=gr.Progress(tra
             input_len = model_inputs["input_ids"].shape[-1]
             with torch.inference_mode():
-                generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
                 generation = generation[0][input_len:]
                 result = processor.decode(generation, skip_special_tokens=True)
@@ -134,39 +134,19 @@ def process_video(input_video, input_text, class_names, progress=gr.Progress(tra
     return video_path, results
 with gr.Blocks() as app:
-    gr.Markdown( """
-    ## PaliGemma 2 Detection with Supervision - Demo
-    <br>
-    <div style="display: flex; gap: 10px;">
-    <a href="https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md">
-        <img src="https://img.shields.io/badge/Github-100000?style=flat&logo=github&logoColor=white" alt="Github">
-    </a>
-    <a href="https://huggingface.co/blog/paligemma">
-        <img src="https://img.shields.io/badge/Huggingface-FFD21E?style=flat&logo=Huggingface&logoColor=black" alt="Huggingface">
-    </a>
-    <a href="https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb">
-        <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab">
-    </a>
-    <a href="https://arxiv.org/abs/2412.03555">
-        <img src="https://img.shields.io/badge/Arvix-B31B1B?style=flat&logo=arXiv&logoColor=white" alt="Paper">
-    </a>
-    <a href="https://supervision.roboflow.com/">
-        <img src="https://img.shields.io/badge/Supervision-6706CE?style=flat&logo=Roboflow&logoColor=white" alt="Supervision">
-    </a>
-    </div>
-    <br>
-    PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
-    built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
-    vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
-    model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
-    answering, text reading, object detection and object segmentation.
-    This space show how to use PaliGemma 2 for object detection with supervision.
-    You can input an image and a text prompt
     """)
     with gr.Tab("Image Detection"):
@@ -175,12 +155,13 @@ with gr.Blocks() as app:
                 input_image = gr.Image(type="pil", label="Input Image")
                 input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
                 class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
             with gr.Column():
                 annotated_image = gr.Image(type="pil", label="Annotated Image")
                 detection_result = gr.Textbox(label="Detection Result")
         gr.Button("Submit").click(
             fn=process_image,
-            inputs=[input_image, input_text, class_names],
             outputs=[annotated_image, detection_result]
         )
@@ -190,13 +171,14 @@ with gr.Blocks() as app:
                 input_video = gr.Video(label="Input Video")
                 input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
                 class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
             with gr.Column():
                 output_video = gr.Video(label="Annotated Video")
                 detection_result = gr.Textbox(label="Detection Result")
         gr.Button("Process Video").click(
             fn=process_video,
-            inputs=[input_video, input_text, class_names],
             outputs=[output_video, detection_result]
         )

 processor = PaliGemmaProcessor.from_pretrained(model_id)
 @spaces.GPU
+def paligemma_detection(input_image, input_text, max_new_tokens):
     model_inputs = processor(text=input_text,
                              images=input_image,
                              return_tensors="pt"
                              ).to(torch.bfloat16).to(model.device)
     input_len = model_inputs["input_ids"].shape[-1]
     with torch.inference_mode():
+        generation = model.generate(**model_inputs, max_new_tokens=max_new_tokens, do_sample=False)
         generation = generation[0][input_len:]
         result = processor.decode(generation, skip_special_tokens=True)
     return result
     return annotated_image
+def process_image(input_image, input_text, class_names, max_new_tokens):
     cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
+    result = paligemma_detection(input_image, input_text, max_new_tokens)
     annotated_image = annotate_image(result,
                                      (input_image.width, input_image.height),
                                      class_names, cv_image)
 @spaces.GPU
+def process_video(input_video, input_text, class_names, max_new_tokens, progress=gr.Progress(track_tqdm=True)):
     if not input_video:
         gr.Info("Please upload a video.")
         return None
             input_len = model_inputs["input_ids"].shape[-1]
             with torch.inference_mode():
+                generation = model.generate(**model_inputs, max_new_tokens=max_new_tokens, do_sample=False)
                 generation = generation[0][input_len:]
                 result = processor.decode(generation, skip_special_tokens=True)
     return video_path, results
 with gr.Blocks() as app:
+    gr.Markdown("""
+## PaliGemma 2 Detection with Supervision - Demo
+[![Github](https://img.shields.io/badge/Github-100000?style=flat&logo=github&logoColor=white)](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) [![Huggingface](https://img.shields.io/badge/Huggingface-FFD21E?style=flat&logo=Huggingface&logoColor=black)](https://huggingface.co/blog/paligemma) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb) [![Paper](https://img.shields.io/badge/Arvix-B31B1B?style=flat&logo=arXiv&logoColor=white)](https://arxiv.org/abs/2412.03555) [![Supervision](https://img.shields.io/badge/Supervision-6706CE?style=flat&logo=Roboflow&logoColor=white)](https://supervision.roboflow.com/)
+PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
+built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
+vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
+model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
+answering, text reading, object detection and object segmentation.
+This space show how to use PaliGemma 2 for object detection with supervision.
+You can input an image and a text prompt
     """)
     with gr.Tab("Image Detection"):
                 input_image = gr.Image(type="pil", label="Input Image")
                 input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
                 class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
+                max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=10, label="Max New Tokens", info="Set to larger for longer generation.")
             with gr.Column():
                 annotated_image = gr.Image(type="pil", label="Annotated Image")
                 detection_result = gr.Textbox(label="Detection Result")
         gr.Button("Submit").click(
             fn=process_image,
+            inputs=[input_image, input_text, class_names, max_new_tokens],
             outputs=[annotated_image, detection_result]
         )
                 input_video = gr.Video(label="Input Video")
                 input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
                 class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
+                max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=1, label="Max New Tokens", info="Set to larger for longer generation.")
             with gr.Column():
                 output_video = gr.Video(label="Annotated Video")
                 detection_result = gr.Textbox(label="Detection Result")
         gr.Button("Process Video").click(
             fn=process_video,
+            inputs=[input_video, input_text, class_names, max_new_tokens],
             outputs=[output_video, detection_result]
         )