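"""Gradio demo for CLIP-DINOiser: open-vocabulary semantic segmentation.

The app takes an image and a comma-separated list of class prompts, runs
CLIP-DINOiser over the image, and returns a blended overlay, the raw
segmentation mask, and the prompts highlighted in their class colors.
"""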
import os
import warnings

import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
from huggingface_hub import Repository
from hydra import compose, initialize
from torchvision import transforms as T

from models.builder import build_model
from segmentation.datasets import PascalVOCDataset
from visualization import mask2rgb

warnings.filterwarnings("ignore")
# Register the Hydra config directory once so `compose` can resolve configs below.
initialize(config_path="configs", version_base=None)
# Clone the Space repository locally; it ships the pretrained checkpoint.
repo = Repository(
    local_dir="clip-dinoiser",
    clone_from="ariG23498/clip-dinoiser",
    use_auth_token=os.environ.get("token"),
)
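# Load the fine-tuned weights shipped in the cloned repository.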
check_path = "clip-dinoiser/checkpoints/last.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"
check = torch.load(check_path, map_location=device)

# Build CLIP-DINOiser with the Pascal VOC class names as the initial vocabulary.
dinoclip_cfg = "clip_dinoiser.yaml"
cfg = compose(config_name=dinoclip_cfg)
model = build_model(cfg.model, class_names=PascalVOCDataset.CLASSES).to(device)
model.clip_backbone.decode_head.use_templates = False  # switch off ImageNet templates for faster inference
model.load_state_dict(check["model_state_dict"], strict=False)
model = model.eval()
# Per-class colors; `color_map` turns each triplet into a hex code for the
# HighlightedText legend, reversing the channel order in the process.
colors = [
    (0, 255, 0),
    (0, 0, 255),
    (255, 255, 0),
    (255, 0, 255),
    (0, 255, 255),
    (114, 128, 250),
    (0, 165, 255),
    (0, 128, 0),
    (144, 238, 144),
    (238, 238, 175),
    (255, 191, 0),
    (0, 128, 0),
    (226, 43, 138),
    (255, 0, 255),
    (0, 215, 255),
    (255, 0, 0),
]
color_map = {
    str(color_id): f"#{color[2]:02x}{color[1]:02x}{color[0]:02x}"
    for color_id, color in enumerate(colors)
}
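# For example, colors[0] == (0, 255, 0) maps to color_map["0"] == "#00ff00".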
def run_clip_dinoiser(input_image, text_prompts):
    """Segment `input_image` against the comma-separated `text_prompts`."""
    image = input_image.convert("RGB")
    text_prompts = [prompt.strip() for prompt in text_prompts.split(",")]
    palette = colors[: len(text_prompts)]

    # Swap the model's vocabulary for the user-provided prompts.
    model.clip_backbone.decode_head.update_vocab(text_prompts)
    model.to(device)
    model.apply_found = True

    img_tens = T.PILToTensor()(image).unsqueeze(0).to(device) / 255.0
    h, w = img_tens.shape[-2:]
    with torch.no_grad():
        output = model(img_tens).cpu()

    # Upsample the patch-level logits back to pixel resolution, then take the
    # per-pixel argmax as the predicted class.
    output = F.interpolate(
        output,
        scale_factor=model.clip_backbone.backbone.patch_size,
        mode="bilinear",
        align_corners=False,
    )[..., :h, :w]
    output = output[0].argmax(dim=0)
    mask = mask2rgb(output, palette)
    # Blend the mask over the input image for the overlay view.
    alpha = 0.5
    blend = alpha * np.array(image) / 255.0 + (1 - alpha) * mask / 255.0

    # Color each character of every prompt with its class color so the
    # HighlightedText legend matches the mask palette.
    h_text = []
    for idx, text in enumerate(text_prompts):
        for char in text:
            h_text.append((char, color_map[str(idx)]))
    return blend, mask, h_text
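# Build the Gradio demo: an image and comma-separated prompts in; the blended
# overlay, the raw mask, and the color-coded prompts out.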
if __name__ == "__main__":
    block = gr.Blocks().queue()
    with block:
        gr.Markdown("<h1><center>CLIP-DINOiser</center></h1>")
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="pil")
                text_prompts = gr.Textbox(label="Enter comma-separated prompts")
                run_button = gr.Button(value="Run")
            with gr.Column():
                with gr.Row():
                    overlay_mask = gr.Image(type="numpy")
                    only_mask = gr.Image(type="numpy")
                h_text = gr.HighlightedText(
                    label="text",
                    combine_adjacent=False,
                    show_legend=False,
                    color_map=color_map,
                )
        run_button.click(
            fn=run_clip_dinoiser,
            inputs=[input_image, text_prompts],
            outputs=[overlay_mask, only_mask, h_text],
        )
        gr.Examples(
            [["vintage_bike.jpeg", "background, vintage bike, leather bag"]],
            inputs=[input_image, text_prompts],
            outputs=[overlay_mask, only_mask, h_text],
            fn=run_clip_dinoiser,
            cache_examples=True,
            label="Try this example input!",
        )
    block.launch(share=False, show_api=False, show_error=True)