"""Gradio demo that captions an image with the FuseCap BLIP checkpoint."""

import gradio as gr
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

# Hugging Face Hub id shared by the processor and the model weights.
MODEL_ID = "noamrot/FuseCap_Image_Captioning"
# Text prompt passed to the processor together with every image.
CAPTION_PREFIX = "a picture of "

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = BlipProcessor.from_pretrained(MODEL_ID)
model = BlipForConditionalGeneration.from_pretrained(MODEL_ID).to(device)


def inference(raw_image):
    """Generate a caption for one image.

    Args:
        raw_image: The image handed over by the Gradio image input, or
            ``None`` when the form is submitted without an image.
            NOTE(review): with the ``"image"`` shortcut used below this is
            presumably a numpy array rather than a PIL image -- confirm
            against the installed Gradio version.

    Returns:
        The decoded caption string with special tokens stripped, or an
        empty string when no image was supplied.
    """
    if raw_image is None:
        # Nothing to caption; avoid an opaque failure inside the processor.
        return ""

    # Named ``model_inputs`` so it does not shadow the module-level ``inputs``.
    model_inputs = processor(
        raw_image, CAPTION_PREFIX, return_tensors="pt"
    ).to(device)
    out = model.generate(**model_inputs)
    return processor.decode(out[0], skip_special_tokens=True)


# NOTE(review): these two components are defined but never passed to
# ``gr.Interface`` below, which uses the "image"/"text" shortcuts instead, so
# the PIL type and the "Caption" label are currently not applied. Wiring them
# in would change the UI (``interactive=False`` on the input), so they are
# left as-is pending a decision.
inputs = [gr.Image(type="pil", interactive=False)]
outputs = gr.Textbox(label="Caption")

description = (
    "Gradio demo for FuseCap: Leveraging Large Language Models for Enriched "
    "Fused Image Captions. This demo features a BLIP-based model, trained "
    "using FuseCap."
)
examples = [["surfer.jpg"], ["bike.jpg"]]
# The original literal was split across physical lines, which is a syntax
# error for a plain quoted string; the line breaks are kept as explicit
# escapes so the text itself is unchanged.
article = (
    "\n\nFuseCap: Leveraging Large Language Models for Enriched Fused "
    "Image Captions"
)

iface = gr.Interface(
    fn=inference,
    inputs="image",
    outputs="text",
    title="FuseCap",
    description=description,
    article=article,
    examples=examples,
)
# Queueing is enabled through ``queue()`` rather than an ``enable_queue``
# keyword on the Interface.
iface.queue()
iface.launch()