"""Streamlit app: take a picture with the camera and caption it with BLIP.

Uses the Salesforce/blip-image-captioning-large model for *conditional*
image captioning: the model is given a text prefix and completes it into
a full caption for the captured photo.
"""

import streamlit as st
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

# Hoisted so the processor and model are guaranteed to refer to the same checkpoint.
MODEL_ID = "Salesforce/blip-image-captioning-large"


@st.cache_resource(show_spinner="Loading captioning model…")
def _load_captioner():
    """Load the BLIP processor and model once per server process.

    Streamlit re-executes this whole script on every widget interaction;
    without @st.cache_resource the large model would be re-instantiated
    (and potentially re-downloaded) on every rerun. The decorator keeps a
    single shared copy across reruns and sessions.

    Returns:
        tuple: (processor, model) for the BLIP captioning checkpoint.
    """
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)
    return processor, model


processor, model = _load_captioner()

enable = st.checkbox("Enable camera")
picture = st.camera_input("Take a picture", disabled=not enable)

if picture:
    # camera_input returns a JPEG buffer; convert("RGB") is a no-op for JPEG
    # but guards against mode mismatches the vision processor can't handle.
    raw_image = Image.open(picture).convert("RGB")

    # Conditional captioning: the model continues this prompt into a caption.
    text = "A view of a person in"
    inputs = processor(raw_image, text, return_tensors="pt")
    out = model.generate(**inputs)
    st.write(processor.decode(out[0], skip_special_tokens=True))