import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
import gradio as gr

# Model identifier on the Hugging Face Hub
MODEL_ID = "microsoft/kosmos-2-patch14-224"


def load_models():
    """Load the pre-trained Kosmos-2 model and its processor."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForVision2Seq.from_pretrained(MODEL_ID).to(device)
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    return model, processor


# Load once at startup so the weights are not re-downloaded/re-initialized on every request.
model, processor = load_models()


def generate_description(image):
    """Generate a caption for the uploaded image."""
    prompt = "An image of"
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    # Move tensors to the same device as the model (GPU if available).
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Generate the description.
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    return generated_text


if __name__ == "__main__":
    interface = gr.Interface(
        generate_description,
        ["image"],
        "text",
        title="GPT-based Visual Storytelling",
        description="Upload an image to get a detailed caption generated by our powerful AI!",
        examples=[
            ["PRO-b0fe1914d67344d98e120a19cd1aadf1.jpg"]
        ],
    )
    interface.launch()
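
# A possible extension (a sketch, not part of the original script): when prompted with
# "<grounding>An image of", Kosmos-2 interleaves grounding markup such as <phrase> tags
# and <patch_index_xxxx> tokens in its output. Kosmos2Processor provides
# post_process_generation() to strip that markup and recover the referenced entities
# with their bounding boxes. generate_description() could use it like this:
#
#   raw_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#   caption, entities = processor.post_process_generation(raw_text)
#   # caption  -> plain-text description without grounding markup
#   # entities -> list of (phrase, (start, end), bounding_boxes) tuples
#
# Returning `caption` instead of the raw decoded string keeps the Gradio output clean
# if grounding is enabled in the prompt.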