onuralpszr committed on
Commit
50942e0
1 Parent(s): 1be4b11

feat: ✨ token slider added

Browse files

Signed-off-by: Onuralp SEZER <thunderbirdtr@gmail.com>

Files changed (1) hide show
  1. app.py +21 -39
app.py CHANGED
@@ -21,14 +21,14 @@ model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(DE
21
  processor = PaliGemmaProcessor.from_pretrained(model_id)
22
 
23
@spaces.GPU
def paligemma_detection(input_image, input_text, max_new_tokens=100):
    """Run a PaliGemma detection prompt against a single image.

    Args:
        input_image: PIL image handed to the processor.
        input_text: Detection prompt, e.g. "detect person;dog".
        max_new_tokens: Generation budget. Default 100 preserves the
            previously hard-coded behavior, so existing callers are
            unaffected.

    Returns:
        str: Decoded model output for the generated portion only
        (prompt tokens stripped, special tokens removed).
    """
    # Build joint text+image tensors; cast to bfloat16 to match the
    # model weights, then move to the model's device.
    model_inputs = processor(
        text=input_text,
        images=input_image,
        return_tensors="pt",
    ).to(torch.bfloat16).to(model.device)
    input_len = model_inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        generation = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    # Drop the echoed prompt tokens; keep only newly generated ones.
    generation = generation[0][input_len:]
    result = processor.decode(generation, skip_special_tokens=True)
    return result
@@ -61,9 +61,9 @@ def annotate_image(result, resolution_wh, class_names, cv_image):
61
  return annotated_image
62
 
63
 
64
- def process_image(input_image,input_text,class_names):
65
  cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
66
- result = paligemma_detection(input_image, input_text)
67
  annotated_image = annotate_image(result,
68
  (input_image.width, input_image.height),
69
  class_names, cv_image)
@@ -71,7 +71,7 @@ def process_image(input_image,input_text,class_names):
71
 
72
 
73
  @spaces.GPU
74
- def process_video(input_video, input_text, class_names, progress=gr.Progress(track_tqdm=True)):
75
  if not input_video:
76
  gr.Info("Please upload a video.")
77
  return None
@@ -100,7 +100,7 @@ def process_video(input_video, input_text, class_names, progress=gr.Progress(tra
100
  input_len = model_inputs["input_ids"].shape[-1]
101
 
102
  with torch.inference_mode():
103
- generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
104
  generation = generation[0][input_len:]
105
  result = processor.decode(generation, skip_special_tokens=True)
106
 
@@ -134,39 +134,19 @@ def process_video(input_video, input_text, class_names, progress=gr.Progress(tra
134
  return video_path, results
135
 
136
  with gr.Blocks() as app:
137
- gr.Markdown( """
138
- ## PaliGemma 2 Detection with Supervision - Demo
139
-
140
- <br>
141
-
142
- <div style="display: flex; gap: 10px;">
143
- <a href="https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md">
144
- <img src="https://img.shields.io/badge/Github-100000?style=flat&logo=github&logoColor=white" alt="Github">
145
- </a>
146
- <a href="https://huggingface.co/blog/paligemma">
147
- <img src="https://img.shields.io/badge/Huggingface-FFD21E?style=flat&logo=Huggingface&logoColor=black" alt="Huggingface">
148
- </a>
149
- <a href="https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb">
150
- <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab">
151
- </a>
152
- <a href="https://arxiv.org/abs/2412.03555">
153
- <img src="https://img.shields.io/badge/Arvix-B31B1B?style=flat&logo=arXiv&logoColor=white" alt="Paper">
154
- </a>
155
- <a href="https://supervision.roboflow.com/">
156
- <img src="https://img.shields.io/badge/Supervision-6706CE?style=flat&logo=Roboflow&logoColor=white" alt="Supervision">
157
- </a>
158
- </div>
159
-
160
- <br>
161
 
162
- PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
163
- built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
164
- vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
165
- model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
166
- answering, text reading, object detection and object segmentation.
167
 
168
- This space show how to use PaliGemma 2 for object detection with supervision.
169
- You can input an image and a text prompt
170
  """)
171
 
172
  with gr.Tab("Image Detection"):
@@ -175,12 +155,13 @@ with gr.Blocks() as app:
175
  input_image = gr.Image(type="pil", label="Input Image")
176
  input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
177
  class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
 
178
  with gr.Column():
179
  annotated_image = gr.Image(type="pil", label="Annotated Image")
180
  detection_result = gr.Textbox(label="Detection Result")
181
  gr.Button("Submit").click(
182
  fn=process_image,
183
- inputs=[input_image, input_text, class_names],
184
  outputs=[annotated_image, detection_result]
185
  )
186
 
@@ -190,13 +171,14 @@ with gr.Blocks() as app:
190
  input_video = gr.Video(label="Input Video")
191
  input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
192
  class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
 
193
  with gr.Column():
194
  output_video = gr.Video(label="Annotated Video")
195
  detection_result = gr.Textbox(label="Detection Result")
196
 
197
  gr.Button("Process Video").click(
198
  fn=process_video,
199
- inputs=[input_video, input_text, class_names],
200
  outputs=[output_video, detection_result]
201
  )
202
 
 
21
  processor = PaliGemmaProcessor.from_pretrained(model_id)
22
 
23
@spaces.GPU
def paligemma_detection(input_image, input_text, max_new_tokens):
    """Generate a PaliGemma detection string for one image/prompt pair.

    Args:
        input_image: PIL image for the processor.
        input_text: Detection prompt, e.g. "detect person;dog".
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        str: The decoded generation, excluding the prompt tokens and
        any special tokens.
    """
    # Tokenize prompt and encode image together, matching the model's
    # bfloat16 weights and device before generation.
    inputs = processor(text=input_text, images=input_image, return_tensors="pt")
    inputs = inputs.to(torch.bfloat16).to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    # Slice off the echoed prompt so only new tokens are decoded.
    new_tokens = output_ids[0][prompt_len:]
    result = processor.decode(new_tokens, skip_special_tokens=True)
    return result
 
61
  return annotated_image
62
 
63
 
64
+ def process_image(input_image, input_text, class_names, max_new_tokens):
65
  cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
66
+ result = paligemma_detection(input_image, input_text, max_new_tokens)
67
  annotated_image = annotate_image(result,
68
  (input_image.width, input_image.height),
69
  class_names, cv_image)
 
71
 
72
 
73
  @spaces.GPU
74
+ def process_video(input_video, input_text, class_names, max_new_tokens, progress=gr.Progress(track_tqdm=True)):
75
  if not input_video:
76
  gr.Info("Please upload a video.")
77
  return None
 
100
  input_len = model_inputs["input_ids"].shape[-1]
101
 
102
  with torch.inference_mode():
103
+ generation = model.generate(**model_inputs, max_new_tokens=max_new_tokens, do_sample=False)
104
  generation = generation[0][input_len:]
105
  result = processor.decode(generation, skip_special_tokens=True)
106
 
 
134
  return video_path, results
135
 
136
  with gr.Blocks() as app:
137
+ gr.Markdown("""
138
+ ## PaliGemma 2 Detection with Supervision - Demo
139
+
140
+ [![Github](https://img.shields.io/badge/Github-100000?style=flat&logo=github&logoColor=white)](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) [![Huggingface](https://img.shields.io/badge/Huggingface-FFD21E?style=flat&logo=Huggingface&logoColor=black)](https://huggingface.co/blog/paligemma) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb) [![Paper](https://img.shields.io/badge/Arvix-B31B1B?style=flat&logo=arXiv&logoColor=white)](https://arxiv.org/abs/2412.03555) [![Supervision](https://img.shields.io/badge/Supervision-6706CE?style=flat&logo=Roboflow&logoColor=white)](https://supervision.roboflow.com/)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
+ PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
143
+ built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
144
+ vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
145
+ model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
146
+ answering, text reading, object detection and object segmentation.
147
 
148
+ This space show how to use PaliGemma 2 for object detection with supervision.
149
+ You can input an image and a text prompt
150
  """)
151
 
152
  with gr.Tab("Image Detection"):
 
155
  input_image = gr.Image(type="pil", label="Input Image")
156
  input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
157
  class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
158
+ max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=10, label="Max New Tokens", info="Set to larger for longer generation.")
159
  with gr.Column():
160
  annotated_image = gr.Image(type="pil", label="Annotated Image")
161
  detection_result = gr.Textbox(label="Detection Result")
162
  gr.Button("Submit").click(
163
  fn=process_image,
164
+ inputs=[input_image, input_text, class_names, max_new_tokens],
165
  outputs=[annotated_image, detection_result]
166
  )
167
 
 
171
  input_video = gr.Video(label="Input Video")
172
  input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
173
  class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
174
+ max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=1, label="Max New Tokens", info="Set to larger for longer generation.")
175
  with gr.Column():
176
  output_video = gr.Video(label="Annotated Video")
177
  detection_result = gr.Textbox(label="Detection Result")
178
 
179
  gr.Button("Process Video").click(
180
  fn=process_video,
181
+ inputs=[input_video, input_text, class_names, max_new_tokens],
182
  outputs=[output_video, detection_result]
183
  )
184