import spaces
import requests
from PIL import Image
from io import BytesIO
import torch
from transformers import CLIPProcessor, CLIPModel
import gradio as gr

# Initialize the model and processor once at startup
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

@spaces.GPU  # Use the GPU decorator for the function that requires the GPU
def get_embedding(image_or_text):
    # Resolve the device inside the function so it uses the GPU when available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    if image_or_text.startswith(('http:', 'https:')):
        # Image URL: download the image and embed it
        response = requests.get(image_or_text, timeout=10)
        response.raise_for_status()  # fail early with a clear error on a bad download
        image = Image.open(BytesIO(response.content))
        inputs = processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            features = model.get_image_features(**inputs).cpu().numpy()
    else:
        # Text input: tokenize and embed it
        inputs = processor(text=[image_or_text], return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            features = model.get_text_features(**inputs).cpu().numpy()
    return features.flatten().tolist()

# Define the Gradio interface
interface = gr.Interface(
    fn=get_embedding,
    inputs="text",
    outputs="json",
    title="CLIP Model Embeddings",
    description="Enter an image URL or text to get embeddings from CLIP.",
)

if __name__ == "__main__":
    interface.launch(share=True)
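
# Example client usage: a minimal sketch, assuming the app is running locally on
# Gradio's default port and that the separate `gradio_client` package is
# installed. A `gr.Interface` exposes its endpoint under the default
# api_name "/predict"; the URL and query string below are illustrative.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   embedding = client.predict("a photo of a cat", api_name="/predict")
#   print(len(embedding))  # length of the returned embedding vector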