"""Streamlit app: guess where a person is attending an online meeting from.

Takes a webcam snapshot via ``st.camera_input`` and asks the BLIP-2
(OPT-2.7b) vision-language model a fixed question about the picture, then
writes the model's answer to the page.
"""

import streamlit as st
import torch
from PIL import Image
from transformers import Blip2ForConditionalGeneration, Blip2Processor

MODEL_ID = "Salesforce/blip2-opt-2.7b"
PROMPT = "Question: At what location is this person most likely attending this online meeting? Answer:"

# Half precision is only used when a GPU is available; on CPU the model is
# kept in float32 so that inputs and weights share a supported dtype.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32


@st.cache_resource
def _load_blip2():
    """Load the BLIP-2 processor and model once per server process.

    Streamlit re-executes this script on every widget interaction;
    ``st.cache_resource`` keeps the loaded objects alive across those reruns
    instead of re-reading the checkpoint each time.

    Returns:
        A ``(processor, model)`` tuple, with the model moved to ``DEVICE``
        and switched to eval mode.
    """
    loaded_processor = Blip2Processor.from_pretrained(MODEL_ID)
    loaded_model = Blip2ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=DTYPE,
    )
    loaded_model.to(DEVICE)
    loaded_model.eval()
    return loaded_processor, loaded_model


processor, model = _load_blip2()

enable = st.checkbox("Enable camera")
picture = st.camera_input("Take a picture", disabled=not enable)

if picture:
    # Normalise the colour mode so the processor always gets 3 channels.
    image = Image.open(picture).convert("RGB")
    # Move the inputs to the model's device and cast them to the model's
    # dtype; a mismatch raises at the first layer of the vision encoder.
    inputs = processor(images=image, text=PROMPT, return_tensors="pt").to(
        DEVICE, DTYPE
    )
    generated_ids = model.generate(**inputs)
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0].strip()
    st.write(generated_text)