"""Minimal Gradio app: caption an image with a selectable captioning model."""

from functools import lru_cache

import torch  # noqa: F401 -- kept: transformers image-to-text pipelines need torch installed
import gradio as gr
from transformers import pipeline

# Registry of selectable models: dropdown label -> Hugging Face model id.
CAPTION_MODELS = {
    'blip-base': 'Salesforce/blip-image-captioning-base',
    'blip-large': 'Salesforce/blip-image-captioning-large',
    'vit-gpt2-coco-en': 'ydshieh/vit-gpt2-coco-en',
    'blip2-2.7b-fp16': 'Mediocreatmybest/blip2-opt-2.7b-fp16-sharded',
}


@lru_cache(maxsize=len(CAPTION_MODELS))
def _get_captioner(model_choice):
    """Build once and cache the image-to-text pipeline for *model_choice*.

    The original code constructed a fresh pipeline -- loading the full
    model from disk/network -- on every single request; caching by model
    key makes repeat requests fast. The key space is bounded by
    CAPTION_MODELS, so the cache cannot grow without limit.
    """
    return pipeline(
        task="image-to-text",
        model=CAPTION_MODELS[model_choice],
        max_new_tokens=30,
        device_map="cpu",
        use_fast=True,
    )


# Simple caption creation
def caption_image(model_choice, image_path):
    """Return a stripped caption string for the image at *image_path*.

    Parameters
    ----------
    model_choice : str
        A key of CAPTION_MODELS selecting which model to run.
    image_path : str
        Path (or URL) to the image; passed straight through to the
        transformers pipeline.
    """
    caption = _get_captioner(model_choice)(image_path)[0]['generated_text']
    return str(caption).strip()


def launch(model_choice, input):
    """Gradio callback: thin wrapper delegating to caption_image.

    NOTE(review): the parameter name `input` shadows the builtin; it is
    kept unchanged for interface compatibility with existing wiring.
    """
    return caption_image(model_choice, input)


# `gr.inputs.Dropdown` (Gradio 3.x deprecated namespace) was removed in
# Gradio 4.x; `gr.Dropdown` is valid on both 3.x and 4.x.
model_dropdown = gr.Dropdown(choices=list(CAPTION_MODELS.keys()), label='Model Choice')

iface = gr.Interface(launch, inputs=[model_dropdown, "text"], outputs="text")
iface.launch()