# clip-dinoiser/app.py: Gradio demo that segments an image from comma-separated text prompts.
import os
import warnings

import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
from hydra import compose, initialize
from huggingface_hub import Repository
from PIL import Image
from torchvision import transforms as T

from models.builder import build_model
from segmentation.datasets import PascalVOCDataset
from visualization import mask2rgb

warnings.filterwarnings("ignore")

# Hydra must be initialized on the local config directory before the model
# config is composed below.
initialize(config_path="configs", version_base=None)
repo = Repository(
    local_dir="clip-dinoiser",
    clone_from="ariG23498/clip-dinoiser",
    use_auth_token=os.environ.get("token"),
)
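
# The Space repository is cloned locally so the pretrained checkpoint can be
# loaded from disk below; the "token" environment variable is assumed to hold
# a Hugging Face access token with read access to ariG23498/clip-dinoiser.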
check_path = 'clip-dinoiser/checkpoints/last.pt'
device = "cuda" if torch.cuda.is_available() else "cpu"
check = torch.load(check_path, map_location=device)

dinoclip_cfg = "clip_dinoiser.yaml"
cfg = compose(config_name=dinoclip_cfg)

model = build_model(cfg.model, class_names=PascalVOCDataset.CLASSES).to(device)
model.clip_backbone.decode_head.use_templates = False  # switch off the ImageNet templates for faster inference
model.load_state_dict(check['model_state_dict'], strict=False)
model = model.eval()
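
# The model is built once at import time and reused for every Gradio request;
# run_clip_dinoiser below only swaps the text vocabulary per request.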
colors = [
    (0, 255, 0),
    (0, 0, 255),
    (255, 255, 0),
    (255, 0, 255),
    (0, 255, 255),
    (114, 128, 250),
    (0, 165, 255),
    (0, 128, 0),
    (144, 238, 144),
    (238, 238, 175),
    (255, 191, 0),
    (0, 128, 0),
    (226, 43, 138),
    (255, 0, 255),
    (0, 215, 255),
    (255, 0, 0),
]
color_map = {
    f"{color_id}": f"#{hex(color[2])[2:].zfill(2)}{hex(color[1])[2:].zfill(2)}{hex(color[0])[2:].zfill(2)}"
    for color_id, color in enumerate(colors)
}
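
# color_map turns every palette entry into a hex string with the channel order
# reversed (index 2 -> red, index 0 -> blue), e.g. (0, 255, 0) -> "#00ff00";
# these hex codes drive the HighlightedText colors in the UI below.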
def run_clip_dinoiser(input_image, text_prompts):
    image = input_image.convert("RGB")
    text_prompts = [prompt.strip() for prompt in text_prompts.split(",")]
    palette = colors[:len(text_prompts)]

    # Swap in the user-provided vocabulary before the forward pass.
    model.clip_backbone.decode_head.update_vocab(text_prompts)
    model.to(device)
    model.apply_found = True

    img_tens = T.PILToTensor()(image).unsqueeze(0).to(device) / 255.

    h, w = img_tens.shape[-2:]
    output = model(img_tens).cpu()
    # Upsample the patch-level logits back to pixel resolution, crop to the
    # original image size, then take the per-pixel argmax over the prompts.
    output = F.interpolate(output, scale_factor=model.clip_backbone.backbone.patch_size,
                           mode="bilinear", align_corners=False)[..., :h, :w]
    output = output[0].argmax(dim=0)
    mask = mask2rgb(output, palette)
    alpha = 0.5
    blend = alpha * np.array(image) / 255. + (1 - alpha) * mask / 255.

    # One (character, color) tuple per prompt character so HighlightedText
    # renders each prompt in the color of its mask.
    h_text = list()
    for idx, text in enumerate(text_prompts):
        for char in text:
            h_text.append((char, color_map[str(idx)]))
    return blend, mask, h_text
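
# A minimal smoke-test sketch, kept commented out; running it outside the
# Gradio app is an assumption, and the image file is the one referenced by the
# example below:
#
#   demo_image = Image.open("vintage_bike.jpeg")
#   blend, mask, _ = run_clip_dinoiser(demo_image, "background, vintage bike, leather bag")
#   print(blend.shape, mask.shape)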
if __name__ == "__main__":
    block = gr.Blocks().queue()
    with block:
        gr.Markdown("<h1><center>CLIP-DINOiser</center></h1>")
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="pil")
                text_prompts = gr.Textbox(label="Enter comma-separated prompts")
                run_button = gr.Button(value="Run")
            with gr.Column():
                with gr.Row():
                    overlay_mask = gr.Image(type="numpy")
                    only_mask = gr.Image(type="numpy")
                h_text = gr.HighlightedText(
                    label="text",
                    combine_adjacent=False,
                    show_legend=False,
                    color_map=color_map,
                )

        run_button.click(
            fn=run_clip_dinoiser,
            inputs=[input_image, text_prompts],
            outputs=[overlay_mask, only_mask, h_text],
        )
        gr.Examples(
            [["vintage_bike.jpeg", "background, vintage bike, leather bag"]],
            inputs=[input_image, text_prompts],
            outputs=[overlay_mask, only_mask, h_text],
            fn=run_clip_dinoiser,
            cache_examples=True,
            label="Try this example input!",
        )

    block.launch(share=False, show_api=False, show_error=True)