onuralpszr committed on
Commit
1be4b11
1 Parent(s): 49d986a

feat: ✨ video detection tab added


Signed-off-by: Onuralp SEZER <thunderbirdtr@gmail.com>

Files changed (4)
  1. .gitattributes +1 -0
  2. app.py +126 -27
  3. helpers/__init__.py +0 -0
  4. helpers/utils.py +25 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
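
(Presumably added so that any .mp4 committed to the Space, for example sample clips for the new video tab, is stored via Git LFS like the other binary patterns above.)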
app.py CHANGED
@@ -6,37 +6,42 @@ import numpy as np
 from PIL import Image
 import gradio as gr
 import spaces
+from helpers.utils import create_directory, delete_directory, generate_unique_name
+import os
 
 BOX_ANNOTATOR = sv.BoxAnnotator()
 LABEL_ANNOTATOR = sv.LabelAnnotator()
 MASK_ANNOTATOR = sv.MaskAnnotator()
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+VIDEO_TARGET_DIRECTORY = "tmp"
 
+create_directory(directory_path=VIDEO_TARGET_DIRECTORY)
 model_id = "google/paligemma2-3b-pt-448"
 model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(DEVICE)
 processor = PaliGemmaProcessor.from_pretrained(model_id)
 
-
-
 @spaces.GPU
-def process_image(input_image,input_text,class_names):
-    class_list = class_names.split(',')
-    cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
-    model_inputs = processor(text=input_text, images=input_image, return_tensors="pt").to(torch.bfloat16).to(model.device)
+def paligemma_detection(input_image, input_text):
+    model_inputs = processor(text=input_text,
+                             images=input_image,
+                             return_tensors="pt"
+                             ).to(torch.bfloat16).to(model.device)
     input_len = model_inputs["input_ids"].shape[-1]
-
     with torch.inference_mode():
        generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
        generation = generation[0][input_len:]
        result = processor.decode(generation, skip_special_tokens=True)
+    return result
 
+def annotate_image(result, resolution_wh, class_names, cv_image):
+
    detections = sv.Detections.from_lmm(
-        sv.LMM.PALIGEMMA,
-        result,
-        resolution_wh=(input_image.width, input_image.height),
-        classes=class_list
+        sv.LMM.PALIGEMMA,
+        result,
+        resolution_wh=resolution_wh,
+        classes=class_names.split(',')
    )
 
    annotated_image = BOX_ANNOTATOR.annotate(
        scene=cv_image.copy(),
        detections=detections
@@ -52,12 +57,87 @@ def process_image(input_image,input_text,class_names):
 
    annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
    annotated_image = Image.fromarray(annotated_image)
+
+    return annotated_image
+
 
+def process_image(input_image,input_text,class_names):
+    cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
+    result = paligemma_detection(input_image, input_text)
+    annotated_image = annotate_image(result,
+                                     (input_image.width, input_image.height),
+                                     class_names, cv_image)
    return annotated_image, result
 
+
+@spaces.GPU
+def process_video(input_video, input_text, class_names, progress=gr.Progress(track_tqdm=True)):
+    if not input_video:
+        gr.Info("Please upload a video.")
+        return None
+
+    if not input_text:
+        gr.Info("Please enter a text prompt.")
+        return None
+
+    name = generate_unique_name()
+    frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
+    create_directory(frame_directory_path)
+
+    video_info = sv.VideoInfo.from_video_path(input_video)
+    frame_generator = sv.get_video_frames_generator(input_video)
+    video_path = os.path.join(VIDEO_TARGET_DIRECTORY, f"{name}.mp4")
+    results = []
+    with sv.VideoSink(video_path, video_info=video_info) as sink:
+        for frame in progress.tqdm(frame_generator, desc="Processing video"):
+            pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+            model_inputs = processor(
+                text=input_text,
+                images=pil_frame,
+                return_tensors="pt"
+            ).to(torch.bfloat16).to(model.device)
+            input_len = model_inputs["input_ids"].shape[-1]
+
+            with torch.inference_mode():
+                generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
+                generation = generation[0][input_len:]
+                result = processor.decode(generation, skip_special_tokens=True)
+
+            detections = sv.Detections.from_lmm(
+                sv.LMM.PALIGEMMA,
+                result,
+                resolution_wh=(video_info.width, video_info.height),
+                classes=class_names.split(',')
+            )
+
+            annotated_frame = BOX_ANNOTATOR.annotate(
+                scene=frame.copy(),
+                detections=detections
+            )
+            annotated_frame = LABEL_ANNOTATOR.annotate(
+                scene=annotated_frame,
+                detections=detections
+            )
+            annotated_frame = MASK_ANNOTATOR.annotate(
+                scene=annotated_frame,
+                detections=detections
+            )
+
+            results.append(result)
+            sink.write_frame(annotated_frame)
+
+    delete_directory(frame_directory_path)
+    return video_path, results
+
 with gr.Blocks() as app:
    gr.Markdown( """
-    ## PaliGemma 2 Detection with Supervision - Demo \n\n
+    ## PaliGemma 2 Detection with Supervision - Demo
+
+    <br>
 
    <div style="display: flex; gap: 10px;">
    <a href="https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md">
@@ -76,8 +156,9 @@ with gr.Blocks() as app:
    <img src="https://img.shields.io/badge/Supervision-6706CE?style=flat&logo=Roboflow&logoColor=white" alt="Supervision">
    </a>
    </div>
+
+    <br>
 
-    \n\n
    PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
    built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
    vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
@@ -87,19 +168,37 @@ with gr.Blocks() as app:
    This space show how to use PaliGemma 2 for object detection with supervision.
    You can input an image and a text prompt
    """)
-    with gr.Row():
-        with gr.Column():
-            input_image = gr.Image(type="pil", label="Input Image")
-            input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
-            class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
-        with gr.Column():
-            annotated_image = gr.Image(type="pil", label="Annotated Image")
-            detection_result = gr.Textbox(label="Detection Result")
-    gr.Button("Submit").click(
-        fn=process_image,
-        inputs=[input_image, input_text, class_names],
-        outputs=[annotated_image, detection_result]
-    )
+
+    with gr.Tab("Image Detection"):
+        with gr.Row():
+            with gr.Column():
+                input_image = gr.Image(type="pil", label="Input Image")
+                input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
+                class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
+            with gr.Column():
+                annotated_image = gr.Image(type="pil", label="Annotated Image")
+                detection_result = gr.Textbox(label="Detection Result")
+        gr.Button("Submit").click(
+            fn=process_image,
+            inputs=[input_image, input_text, class_names],
+            outputs=[annotated_image, detection_result]
+        )
+
+    with gr.Tab("Video Detection"):
+        with gr.Row():
+            with gr.Column():
+                input_video = gr.Video(label="Input Video")
+                input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog")
+                class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names")
+            with gr.Column():
+                output_video = gr.Video(label="Annotated Video")
+                detection_result = gr.Textbox(label="Detection Result")
+
+        gr.Button("Process Video").click(
+            fn=process_video,
+            inputs=[input_video, input_text, class_names],
+            outputs=[output_video, detection_result]
+        )
 
 if __name__ == "__main__":
    app.launch()
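
For context, the new process_video is a standard supervision read-annotate-write pipeline: PaliGemma returns detections as plain text (location tokens followed by class labels), sv.Detections.from_lmm parses that text into boxes, and sv.VideoSink re-encodes the annotated frames. A minimal sketch of the sink pattern in isolation, with hypothetical input/output paths and the per-frame model call omitted:

import supervision as sv

SOURCE_PATH = "input.mp4"   # hypothetical input video
TARGET_PATH = "output.mp4"  # hypothetical output path

# VideoInfo carries width/height/fps so the sink can mirror the source format.
video_info = sv.VideoInfo.from_video_path(SOURCE_PATH)
frames = sv.get_video_frames_generator(SOURCE_PATH)  # yields frames as BGR numpy arrays

with sv.VideoSink(TARGET_PATH, video_info=video_info) as sink:
    for frame in frames:
        # process_video runs PaliGemma here and stacks the BOX/LABEL/MASK
        # annotators on the frame; this sketch writes it through unchanged.
        sink.write_frame(frame)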
helpers/__init__.py ADDED
File without changes
helpers/utils.py ADDED
@@ -0,0 +1,25 @@
+import datetime
+import os
+import shutil
+import uuid
+
+def create_directory(directory_path: str) -> None:
+    if not os.path.exists(directory_path):
+        os.makedirs(directory_path)
+
+
+def delete_directory(directory_path: str) -> None:
+    if not os.path.exists(directory_path):
+        raise FileNotFoundError(f"Directory '{directory_path}' does not exist.")
+
+    try:
+        shutil.rmtree(directory_path)
+    except PermissionError:
+        raise PermissionError(
+            f"Permission denied: Unable to delete '{directory_path}'.")
+
+
+def generate_unique_name():
+    current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+    unique_id = uuid.uuid4()
+    return f"{current_datetime}_{unique_id}"
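
Taken together, app.py uses these helpers to give each video job its own scratch directory and to remove it once the annotated video is written. A short usage sketch, with the "tmp" root mirroring VIDEO_TARGET_DIRECTORY in app.py:

import os
from helpers.utils import create_directory, delete_directory, generate_unique_name

name = generate_unique_name()           # e.g. "20241205103000_<uuid4>"
work_dir = os.path.join("tmp", name)    # per-job scratch directory
create_directory(work_dir)              # no-op if the directory already exists
# ... write intermediate frames or other artifacts into work_dir ...
delete_directory(work_dir)              # raises FileNotFoundError or PermissionError on failure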