import torch
import gradio as gr
from lavis.models import load_model_and_preprocess

# Use a GPU when available; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BLIP captioning model (ViT-L backbone, fine-tuned on COCO)
# along with its matching image preprocessors.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip_caption", model_type="large_coco", is_eval=True, device=device
)


def predict(image):
    # Convert to RGB so grayscale/RGBA uploads match the processor's expected input,
    # then apply the eval-time transforms and add a batch dimension.
    processed_image = vis_processors["eval"](image.convert("RGB")).unsqueeze(0).to(device)
    # Sample three diverse captions with nucleus sampling.
    captions = model.generate(
        {"image": processed_image}, use_nucleus_sampling=True, num_captions=3
    )
    return image, "\n".join(captions)


demo = gr.Interface(
    title="Image Captioning - BLIP",
    fn=predict,
    inputs=gr.Image(type="pil", label="Original Image"),
    outputs=[gr.Image(type="pil", label="Image"), gr.Textbox(label="Generated Captions")],
    examples=["example_1.jpg", "example_2.jpg", "example_3.jpg"],
)

demo.launch()