import gradio as gr
import torch
from PIL import Image

from model import BlipBaseModel, GitBaseCocoModel

# Maps the model name shown in the UI to the wrapper class implementing it.
MODELS = {
    "Git-Base-COCO": GitBaseCocoModel,
    "Blip Base": BlipBaseModel,
}

# Download the example images offered by the interface.
# The Hugging Face "/resolve/" route serves the raw file contents; the
# "/blob/" route serves the HTML file-viewer page, which is not an image.
_EXAMPLES_BASE_URL = (
    "https://huggingface.co/datasets/chats-bug/"
    "test-image-caption-Listed/resolve/main"
)
examples = []
for _index in range(1, 4):
    _local_name = f"image{_index}.png"
    torch.hub.download_url_to_file(
        f"{_EXAMPLES_BASE_URL}/Image{_index}.png",
        _local_name,
    )
    examples.append([_local_name])


def generate_captions(
    image,
    num_captions,
    model_name,
    max_length,
    temperature,
    top_k,
    top_p,
    repetition_penalty,
    diversity_penalty,
):
    """
    Generates captions for the given image.

    -----
    Parameters:
    image: PIL.Image
        The image to generate captions for.
    num_captions: int
        The number of captions to generate.
    model_name: str
        Key into MODELS selecting which model class to instantiate.
    ** Rest of the parameters are forwarded to the model's generate method. **

    -----
    Returns:
    str
        The generated captions joined into a single string, one per line.

    -----
    Raises:
    KeyError
        If model_name is not a key of MODELS.
    """
    # Convert the numerical values to their corresponding types.
    # Gradio Slider returns values as floats: except when the value is a
    # whole number, in which case it returns an int.
    # Only float values suffer from this issue.
    temperature = float(temperature)
    top_p = float(top_p)
    repetition_penalty = float(repetition_penalty)
    diversity_penalty = float(diversity_penalty)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # NOTE: a new model instance is constructed on every call.
    model = MODELS[model_name](device)

    captions = model.generate(
        image=image,
        max_length=max_length,
        num_captions=num_captions,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        diversity_penalty=diversity_penalty,
    )

    # Convert list to a single string separated by newlines.
    return "\n".join(captions)


title = "AI tool for generating captions for images"
description = "This tool uses pretrained models to generate captions for images."

# Build the Gradio UI. The order of `inputs` must match the positional
# parameter order of `generate_captions`.
interface = gr.Interface(
    fn=generate_captions,
    inputs=[
        gr.components.Image(type="pil", label="Image"),
        gr.components.Slider(
            minimum=1,
            maximum=10,
            step=1,
            value=1,
            label="Number of Captions to Generate",
        ),
        # Pass the choices as a concrete list rather than a dict_keys view.
        gr.components.Dropdown(
            list(MODELS),
            label="Model",
            value=list(MODELS)[1],  # Default to Blip Base
        ),
        gr.components.Slider(
            minimum=20,
            maximum=100,
            step=5,
            value=50,
            label="Maximum Caption Length",
        ),
        gr.components.Slider(
            minimum=0.1,
            maximum=10.0,
            step=0.1,
            value=1.0,
            label="Temperature",
        ),
        gr.components.Slider(
            minimum=1,
            maximum=100,
            step=1,
            value=50,
            label="Top K",
        ),
        # Top-p is a cumulative probability threshold, so the slider is
        # capped at 1.0; larger values are not valid for nucleus sampling.
        gr.components.Slider(
            minimum=0.1,
            maximum=1.0,
            step=0.1,
            value=1.0,
            label="Top P",
        ),
        gr.components.Slider(
            minimum=1.0,
            maximum=10.0,
            step=0.1,
            value=2.0,
            label="Repetition Penalty",
        ),
        gr.components.Slider(
            minimum=0.0,
            maximum=10.0,
            step=0.1,
            value=2.0,
            label="Diversity Penalty",
        ),
    ],
    outputs=[
        gr.components.Textbox(label="Caption"),
    ],
    examples=examples,
    title=title,
    description=description,
    allow_flagging="never",
)


if __name__ == "__main__":
    # Launch the interface.
    interface.launch(
        enable_queue=True,
        debug=True,
    )