import spaces from pip._internal import main main(['install', 'timm==1.0.8']) import timm print("installed", timm.__version__) import gradio as gr from inference import sam_preprocess, beit3_preprocess from model.evf_sam import EvfSamModel from transformers import AutoTokenizer import torch import numpy as np import sys import os version = "YxZhang/evf-sam-multitask" model_type = "ori" tokenizer = AutoTokenizer.from_pretrained( version, padding_side="right", use_fast=False, ) kwargs = { "torch_dtype": torch.half, } model = EvfSamModel.from_pretrained(version, low_cpu_mem_usage=True, **kwargs).eval() model.to('cuda') @spaces.GPU @torch.no_grad() def pred(image_np, prompt, semantic_type): original_size_list = [image_np.shape[:2]] image_beit = beit3_preprocess(image_np, 224).to(dtype=model.dtype, device=model.device) image_sam, resize_shape = sam_preprocess(image_np, model_type=model_type) image_sam = image_sam.to(dtype=model.dtype, device=model.device) if semantic_type: prompt = "[semantic] " + prompt input_ids = tokenizer( prompt, return_tensors="pt")["input_ids"].to(device=model.device) # infer pred_mask = model.inference( image_sam.unsqueeze(0), image_beit.unsqueeze(0), input_ids, resize_list=[resize_shape], original_size_list=original_size_list, ) pred_mask = pred_mask.detach().cpu().numpy()[0] pred_mask = pred_mask > 0 visualization = image_np.copy() visualization[pred_mask] = (image_np * 0.5 + pred_mask[:, :, None].astype(np.uint8) * np.array([50, 120, 220]) * 0.5)[pred_mask] return visualization / 255.0, pred_mask.astype(np.float16) desc = """
EVF-SAM extends SAM's capabilities with text-prompted segmentation, achieving high accuracy in Referring Expression Segmentation.