"""Gradio interface for image captioning with the BLIP model.

Users upload an image and receive a generated caption as text.
"""

import logging
from typing import Optional

import gradio as gr
import spaces
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

logger = logging.getLogger(__name__)

# Single source of truth for the checkpoint and the device, so the processor,
# the model and the input tensors cannot drift apart.
MODEL_ID = "Salesforce/blip-image-captioning-base"
DEVICE = "cuda"

# Initialize the processor and model once at import time.
processor = BlipProcessor.from_pretrained(MODEL_ID)
model = BlipForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)


def generate_caption(image: Image.Image) -> str:
    """Generate a caption for a given image using the BLIP model.

    Args:
        image: The input image as a PIL Image object.

    Returns:
        The generated caption, decoded with special tokens skipped.
    """
    # The input tensors must live on the same device as the model.
    inputs = processor(images=image, return_tensors="pt").to(DEVICE)
    outputs = model.generate(**inputs)
    # Only the first generated sequence is decoded; one image is passed in.
    return processor.decode(outputs[0], skip_special_tokens=True)


@spaces.GPU
def caption_image(image: Optional[Image.Image]) -> str:
    """Return a caption for a PIL image, or a readable error message.

    This is the callback wired into the Gradio interface. It never raises:
    any failure is logged and reported back to the user as text.

    Args:
        image: The input image as a PIL Image object. May be ``None`` when
            the form is submitted without an image (assumed from Gradio's
            Image component behaviour; confirm against the Gradio docs).

    Returns:
        The generated caption, or a message describing what went wrong.
    """
    if image is None:
        return "An error occurred: no image was provided."

    try:
        return generate_caption(image)
    except Exception as exc:  # UI boundary: report the failure, don't crash.
        # Keep the traceback in the logs; the user only sees the message.
        logger.exception("Caption generation failed")
        return f"An error occurred: {exc}"


# Define the Gradio interface.
demo = gr.Interface(
    fn=caption_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Image Captioning with BLIP",
    description="Upload an image to generate a caption.",
)

# Launch the interface only when run as a script, so importing this module
# (e.g. to reuse `generate_caption`) does not start a web server.
if __name__ == "__main__":
    demo.launch()