onuralpszr committed
Commit 5645efe
1 Parent(s): d552355

feat: ✨ Segmentation methods are added


Signed-off-by: Onuralp SEZER <thunderbirdtr@gmail.com>

.gitignore ADDED
@@ -0,0 +1,168 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
app.py CHANGED
@@ -6,7 +6,8 @@ import numpy as np
  from PIL import Image
  import gradio as gr
  import spaces
- from helpers.utils import create_directory, delete_directory, generate_unique_name
+ from helpers.file_utils import create_directory, delete_directory, generate_unique_name
+ from helpers.segment_utils import parse_segmentation, extract_objs
  import os

  BOX_ANNOTATOR = sv.BoxAnnotator()
@@ -14,10 +15,12 @@ LABEL_ANNOTATOR = sv.LabelAnnotator()
  MASK_ANNOTATOR = sv.MaskAnnotator()
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  VIDEO_TARGET_DIRECTORY = "tmp"
+ VAE_MODEL = "vae-oid.npz"

+ COLORS = ['#4285f4', '#db4437', '#f4b400', '#0f9d58', '#e48ef1']

  INTRO_TEXT = """
- ## PaliGemma 2 Detection with Supervision - Demo
+ ## PaliGemma 2 Detection/Segmentation with Supervision - Demo

  <div style="display: flex; gap: 10px;">
  <a href="https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md">
@@ -60,6 +63,14 @@ def parse_class_names(prompt):
      classes_text = prompt[7:].strip()
      return [cls.strip() for cls in classes_text.split(';') if cls.strip()]

+ def parse_prompt_type(prompt):
+     """Determine if the prompt is for detection or segmentation."""
+     if prompt.lower().startswith('detect '):
+         return 'detection', prompt[7:].strip()
+     elif prompt.lower().startswith('segment '):
+         return 'segmentation', prompt[8:].strip()
+     return None, prompt
+
  @spaces.GPU
  def paligemma_detection(input_image, input_text, max_new_tokens):
      model_inputs = processor(text=input_text,
@@ -110,10 +121,58 @@ def annotate_image(result, resolution_wh, prompt, cv_image):

  def process_image(input_image, input_text, max_new_tokens):
      cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
-     result = paligemma_detection(input_image, input_text, max_new_tokens)
-     annotated_image = annotate_image(result,
-                                      (input_image.width, input_image.height),
-                                      input_text, cv_image)
+     prompt_type, cleaned_prompt = parse_prompt_type(input_text)
+
+     if prompt_type == 'detection':
+         # Existing detection logic
+         result = paligemma_detection(input_image, input_text, max_new_tokens)
+         class_names = [cls.strip() for cls in cleaned_prompt.split(';') if cls.strip()]
+
+         detections = sv.Detections.from_lmm(
+             sv.LMM.PALIGEMMA,
+             result,
+             resolution_wh=(input_image.width, input_image.height),
+             classes=class_names
+         )
+
+         annotated_image = BOX_ANNOTATOR.annotate(scene=cv_image.copy(), detections=detections)
+         annotated_image = LABEL_ANNOTATOR.annotate(scene=annotated_image, detections=detections)
+         annotated_image = MASK_ANNOTATOR.annotate(scene=annotated_image, detections=detections)
+
+     elif prompt_type == 'segmentation':
+         # Use parse_segmentation for segmentation tasks
+         result = paligemma_detection(input_image, input_text, max_new_tokens)
+         input_image, annotations = parse_segmentation(input_image, result)
+
+         # Create annotated image
+         annotated_image = cv_image.copy()
+         for mask, label in annotations:
+             if isinstance(mask, np.ndarray):  # If it's a segmentation mask
+                 # Create colored mask
+                 color_idx = hash(label) % len(COLORS)
+                 color = tuple(int(COLORS[color_idx].lstrip('#')[i:i+2], 16) for i in (0, 2, 4))
+                 colored_mask = np.zeros_like(cv_image)
+                 colored_mask[mask > 0] = color
+
+                 # Blend mask with image
+                 alpha = 0.5
+                 annotated_image = cv2.addWeighted(annotated_image, 1, colored_mask, alpha, 0)
+
+                 # Add label where mask starts
+                 y_coords, x_coords = np.where(mask > 0)
+                 if len(y_coords) > 0 and len(x_coords) > 0:
+                     label_y = y_coords.min()
+                     label_x = x_coords.min()
+                     cv2.putText(annotated_image, label, (label_x, label_y-10),
+                                 cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)
+     else:
+         gr.Warning("Invalid prompt format. Please use 'detect' or 'segment' followed by class names")
+         return input_image, "Invalid prompt format"
+
+     # Convert back to RGB for display
+     annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
+     annotated_image = Image.fromarray(annotated_image)
+
      return annotated_image, result


@@ -188,13 +247,13 @@ def process_video(input_video, input_text, max_new_tokens, progress=gr.Progress(
  with gr.Blocks() as app:
      gr.Markdown(INTRO_TEXT)

-     with gr.Tab("Image Detection"):
+     with gr.Tab("Image Detection/Segmentation"):
          with gr.Row():
              with gr.Column():
                  input_image = gr.Image(type="pil", label="Input Image")
                  input_text = gr.Textbox(
                      lines=2,
-                     placeholder="Enter prompt in format like this: detect person;dog;building",
+                     placeholder="Enter prompt in format like this: detect person;dog;building or segment person;dog;building",
                      label="Enter detection prompt"
                  )
                  max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=10, label="Max New Tokens", info="Set to larger for longer generation.")
@@ -213,7 +272,7 @@ with gr.Blocks() as app:
                  input_video = gr.Video(label="Input Video")
                  input_text = gr.Textbox(
                      lines=2,
-                     placeholder="Enter prompt in format like this: detect person;dog;building",
+                     placeholder="Enter prompt in format like this: detect person;dog;building or segment person;dog;building",
                      label="Enter detection prompt"
                  )
                  max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=1, label="Max New Tokens", info="Set to larger for longer generation.")
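
For reference, a rough sketch of how the updated process_image entry point could be driven outside the Gradio UI. This is illustrative only: it assumes app.py is importable in your environment (importing it also initializes the PaliGemma model and processor at module level), and the image path is hypothetical.

from PIL import Image
from app import process_image  # importing app also sets up the model/processor

image = Image.open("example.jpg")  # hypothetical local test image

# Detection prompt: boxes, labels and masks drawn with the supervision annotators
annotated, raw = process_image(image, "detect person;dog;building", 100)

# Segmentation prompt: <seg> tokens decoded to masks by helpers/segment_utils.py
annotated, raw = process_image(image, "segment person;dog", 100)
annotated.save("annotated.png")  # both branches return a PIL image plus the raw model output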
helpers/{utils.py → file_utils.py} RENAMED
File without changes
helpers/segment_utils.py ADDED
@@ -0,0 +1,190 @@
+ import flax.linen as nn
+ import jax
+ import jax.numpy as jnp
+ import re
+ import numpy as np
+ import functools
+ from PIL import Image
+
+ ### Postprocessing Utils for Segmentation Tokens
+ ### Segmentation tokens are passed to another VAE which decodes them to a mask
+
+ _MODEL_PATH = 'vae-oid.npz'
+
+ _SEGMENT_DETECT_RE = re.compile(
+     r'(.*?)' +
+     r'<loc(\d{4})>' * 4 + r'\s*' +
+     '(?:%s)?' % (r'<seg(\d{3})>' * 16) +
+     r'\s*([^;<>]+)? ?(?:; )?',
+ )
+ COLORS = ['#4285f4', '#db4437', '#f4b400', '#0f9d58', '#e48ef1']
+
+
+ def parse_segmentation(input_image, inference_output):
+     objs = extract_objs(inference_output.lstrip("\n"), input_image.size[0], input_image.size[1], unique_labels=True)
+     labels = set(obj.get('name') for obj in objs if obj.get('name'))
+     color_map = {l: COLORS[i % len(COLORS)] for i, l in enumerate(labels)}
+     highlighted_text = [(obj['content'], obj.get('name')) for obj in objs]
+     annotated_img = (
+         input_image,
+         [
+             (
+                 obj['mask'] if obj.get('mask') is not None else obj['xyxy'],
+                 obj['name'] or '',
+             )
+             for obj in objs
+             if 'mask' in obj or 'xyxy' in obj
+         ],
+     )
+     has_annotations = bool(annotated_img[1])
+     return annotated_img
+
+
+ def _get_params(checkpoint):
+     """Converts PyTorch checkpoint to Flax params."""
+
+     def transp(kernel):
+         return np.transpose(kernel, (2, 3, 1, 0))
+
+     def conv(name):
+         return {
+             'bias': checkpoint[name + '.bias'],
+             'kernel': transp(checkpoint[name + '.weight']),
+         }
+
+     def resblock(name):
+         return {
+             'Conv_0': conv(name + '.0'),
+             'Conv_1': conv(name + '.2'),
+             'Conv_2': conv(name + '.4'),
+         }
+
+     return {
+         '_embeddings': checkpoint['_vq_vae._embedding'],
+         'Conv_0': conv('decoder.0'),
+         'ResBlock_0': resblock('decoder.2.net'),
+         'ResBlock_1': resblock('decoder.3.net'),
+         'ConvTranspose_0': conv('decoder.4'),
+         'ConvTranspose_1': conv('decoder.6'),
+         'ConvTranspose_2': conv('decoder.8'),
+         'ConvTranspose_3': conv('decoder.10'),
+         'Conv_1': conv('decoder.12'),
+     }
+
+
+ def _quantized_values_from_codebook_indices(codebook_indices, embeddings):
+     batch_size, num_tokens = codebook_indices.shape
+     assert num_tokens == 16, codebook_indices.shape
+     unused_num_embeddings, embedding_dim = embeddings.shape
+
+     encodings = jnp.take(embeddings, codebook_indices.reshape((-1)), axis=0)
+     encodings = encodings.reshape((batch_size, 4, 4, embedding_dim))
+     return encodings
+
+
+ @functools.cache
+ def _get_reconstruct_masks():
+     """Reconstructs masks from codebook indices.
+     Returns:
+       A function that expects indices shaped `[B, 16]` of dtype int32, each
+       ranging from 0 to 127 (inclusive), and that returns a decoded masks sized
+       `[B, 64, 64, 1]`, of dtype float32, in range [-1, 1].
+     """
+
+     class ResBlock(nn.Module):
+         features: int
+
+         @nn.compact
+         def __call__(self, x):
+             original_x = x
+             x = nn.Conv(features=self.features, kernel_size=(3, 3), padding=1)(x)
+             x = nn.relu(x)
+             x = nn.Conv(features=self.features, kernel_size=(3, 3), padding=1)(x)
+             x = nn.relu(x)
+             x = nn.Conv(features=self.features, kernel_size=(1, 1), padding=0)(x)
+             return x + original_x
+
+     class Decoder(nn.Module):
+         """Upscales quantized vectors to mask."""
+
+         @nn.compact
+         def __call__(self, x):
+             num_res_blocks = 2
+             dim = 128
+             num_upsample_layers = 4
+
+             x = nn.Conv(features=dim, kernel_size=(1, 1), padding=0)(x)
+             x = nn.relu(x)
+
+             for _ in range(num_res_blocks):
+                 x = ResBlock(features=dim)(x)
+
+             for _ in range(num_upsample_layers):
+                 x = nn.ConvTranspose(
+                     features=dim,
+                     kernel_size=(4, 4),
+                     strides=(2, 2),
+                     padding=2,
+                     transpose_kernel=True,
+                 )(x)
+                 x = nn.relu(x)
+                 dim //= 2
+
+             x = nn.Conv(features=1, kernel_size=(1, 1), padding=0)(x)
+
+             return x
+
+     def reconstruct_masks(codebook_indices):
+         quantized = _quantized_values_from_codebook_indices(
+             codebook_indices, params['_embeddings']
+         )
+         return Decoder().apply({'params': params}, quantized)
+
+     with open(_MODEL_PATH, 'rb') as f:
+         params = _get_params(dict(np.load(f)))
+
+     return jax.jit(reconstruct_masks, backend='cpu')
+ def extract_objs(text, width, height, unique_labels=False):
+     """Returns objs for a string with "<loc>" and "<seg>" tokens."""
+     objs = []
+     seen = set()
+     while text:
+         m = _SEGMENT_DETECT_RE.match(text)
+         if not m:
+             break
+         print("m", m)
+         gs = list(m.groups())
+         before = gs.pop(0)
+         name = gs.pop()
+         y1, x1, y2, x2 = [int(x) / 1024 for x in gs[:4]]
+
+         y1, x1, y2, x2 = map(round, (y1*height, x1*width, y2*height, x2*width))
+         seg_indices = gs[4:20]
+         if seg_indices[0] is None:
+             mask = None
+         else:
+             seg_indices = np.array([int(x) for x in seg_indices], dtype=np.int32)
+             m64, = _get_reconstruct_masks()(seg_indices[None])[..., 0]
+             m64 = np.clip(np.array(m64) * 0.5 + 0.5, 0, 1)
+             m64 = Image.fromarray((m64 * 255).astype('uint8'))
+             mask = np.zeros([height, width])
+             if y2 > y1 and x2 > x1:
+                 mask[y1:y2, x1:x2] = np.array(m64.resize([x2 - x1, y2 - y1])) / 255.0
+
+         content = m.group()
+         if before:
+             objs.append(dict(content=before))
+             content = content[len(before):]
+         while unique_labels and name in seen:
+             name = (name or '') + "'"
+         seen.add(name)
+         objs.append(dict(
+             content=content, xyxy=(x1, y1, x2, y2), mask=mask, name=name))
+         text = text[len(before) + len(content):]
+
+     if text:
+         objs.append(dict(content=text))
+
+     return objs
+
+ #########
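
As an illustration only (not part of the commit), a small sketch of how extract_objs parses PaliGemma output. The sample string is made up; since it carries no <seg> tokens the vae-oid.npz checkpoint is never loaded, though jax and flax from the updated requirements must still be importable.

from helpers.segment_utils import extract_objs

# Hypothetical model output: one box in <loc> tokens (coordinates normalized to 1024), no <seg> tokens
sample = "<loc0256><loc0128><loc0768><loc0896> person"

objs = extract_objs(sample, width=640, height=480)
for obj in objs:
    # class name, pixel-space (x1, y1, x2, y2) box, and mask (None when no <seg> tokens are present)
    print(obj.get("name"), obj.get("xyxy"), obj.get("mask"))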
requirements.txt CHANGED
@@ -3,4 +3,6 @@ transformers==4.47.0
  requests
  tqdm
  spaces
- torch
+ torch
+ jax
+ flax
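
Since helpers/segment_utils.py now pulls in JAX and Flax, a quick post-install sanity check might look like the following sketch (illustrative only; the mask decoder is jitted with backend='cpu', so CPU devices are sufficient):

import jax
import flax

print(jax.__version__, flax.__version__)
print(jax.devices())  # CPU devices are enough for the VAE mask decoder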