# requirements.txt
# transformers==4.45.2

import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import MobileViTImageProcessor, MobileViTForSemanticSegmentation

# MobileViTFeatureExtractor is deprecated in transformers 4.45;
# MobileViTImageProcessor is the current equivalent.
image_processor = MobileViTImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")
model.eval()

# PASCAL VOC color palette: one RGB triple per class index (21 classes,
# class 0 = background). dtype is uint8 so the mask renders directly as an image.
COLORS = np.array([
    [0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0],
    [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128],
    [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0],
    [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128],
    [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0],
    [0, 64, 128],
], dtype=np.uint8)

def segment_image(image):
    # Preprocess, run the model, and take the highest-scoring class per pixel.
    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits  # shape: (1, num_classes, height, width)
    predicted_mask = logits.argmax(dim=1).squeeze(0).numpy()

    # Map class indices to colors, then resize the mask back to the input size.
    # NEAREST keeps class boundaries crisp instead of blending palette colors.
    colored_mask = COLORS[predicted_mask]
    colored_mask_image = Image.fromarray(colored_mask)
    return colored_mask_image.resize(image.size, Image.NEAREST)

interface = gr.Interface(
    fn=segment_image,
    inputs=gr.Image(type="pil"),
    outputs="image",
    title="Image Segmentation with MobileViT",
    description="Upload an image to see the semantic segmentation result. "
                "The segmentation mask uses different colors to indicate different classes.",
)

interface.launch(share=True)
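
# --- Optional sketch (not part of the original app): class legend ---------
# A minimal helper showing how to interpret the mask colors: it maps each
# predicted class index back to a human-readable name via
# model.config.id2label (PASCAL VOC labels for this checkpoint). It reuses
# image_processor, model, and COLORS from above; since launch() blocks, any
# actual call belongs before interface.launch(). "example.jpg" below is a
# hypothetical path.
def describe_prediction(image):
    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    mask = logits.argmax(dim=1).squeeze(0).numpy()
    # np.unique yields only the class indices actually present in the image.
    for class_id in np.unique(mask):
        label = model.config.id2label[int(class_id)]
        r, g, b = COLORS[class_id]
        print(f"class {class_id:2d} ({label}): RGB({r}, {g}, {b})")

# Example usage:
# describe_prediction(Image.open("example.jpg").convert("RGB"))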