import streamlit as st
from transformers import pipeline
from PIL import Image
import torch

# Load Hugging Face token from Streamlit secrets
HF_TOKEN = st.secrets["hf_token"]

# Model to load
model_id = "meta-llama/Llama-3.2-11B-Vision"


# Initialize the multimodal pipeline once and cache it across Streamlit reruns
@st.cache_resource
def load_pipeline():
    return pipeline(
        "image-text-to-text",  # multimodal task: image + text in, text out
        model=model_id,
        torch_dtype=torch.bfloat16,
        token=HF_TOKEN,
    )


pipe = load_pipeline()

# Streamlit UI
st.title("Multimodal LLM Inference")
st.write(f"**Using model:** {model_id}")

# Text input
input_text = st.text_input("Enter your prompt:")

# Image input
uploaded_file = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])

if st.button("Generate"):
    if input_text and uploaded_file:
        # Load the uploaded image
        image = Image.open(uploaded_file)

        # Prepare the multimodal chat input: the image and the prompt
        # go together in the user turn's content list
        messages = [
            {"role": "system", "content": "You are a multimodal assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": input_text},
                ],
            },
        ]

        # Generate a response
        response = pipe(text=messages, max_new_tokens=30)

        # Display results (the last message in generated_text is the
        # assistant's reply, assuming the chat-style output structure)
        st.write("Generated Response:")
        st.write(response[0]["generated_text"][-1]["content"])
    else:
        st.error("Please enter a prompt and upload an image.")
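
For st.secrets["hf_token"] to resolve, the Hugging Face token must be defined in Streamlit's secrets file at .streamlit/secrets.toml. A minimal sketch, assuming the script above is saved as app.py (the filename is an assumption):

# .streamlit/secrets.toml
hf_token = "hf_..."  # your Hugging Face access token

streamlit run app.py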