import torch
import gradio as gr
from lavis.models import load_model_and_preprocess

# Use a GPU when available; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BLIP captioning model (ViT-L backbone, fine-tuned on COCO)
# along with its matching image preprocessors.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip_caption", model_type="large_coco", is_eval=True, device=device
)


def predict(image):
    # Convert to RGB so grayscale/RGBA uploads match the processor's expected input,
    # then apply the eval-time transforms and add a batch dimension.
    processed_image = vis_processors["eval"](image.convert("RGB")).unsqueeze(0).to(device)
    # Sample three diverse captions with nucleus sampling.
    captions = model.generate(
        {"image": processed_image}, use_nucleus_sampling=True, num_captions=3
    )
    return image, "\n".join(captions)


demo = gr.Interface(
    title="Image Captioning - BLIP",
    fn=predict,
    inputs=gr.Image(type="pil", label="Original Image"),
    outputs=[gr.Image(type="pil", label="Image"), gr.Textbox(label="Generated Captions")],
    examples=["example_1.jpg", "example_2.jpg", "example_3.jpg"],
)

demo.launch()