import streamlit as st
from transformers import pipeline
from PIL import Image
import torch
# Load the Hugging Face access token from Streamlit secrets
HF_TOKEN = st.secrets["hf_token"]
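# st.secrets reads from .streamlit/secrets.toml locally, or from the Space's
# secrets settings when deployed. A minimal secrets file would look like the
# sketch below (the key name "hf_token" is just a choice; it has to match the
# lookup above):
#
#   # .streamlit/secrets.toml
#   hf_token = "hf_..."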
# Gated Llama 3.2 Vision checkpoint; the -Instruct variant is tuned for the
# chat-style messages used below (the base Vision model is not).
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Initialize the pipeline once and cache it so Streamlit does not reload the
# 11B model (roughly 22 GB in bfloat16) on every rerun. "image-text-to-text"
# is the actual transformers task for vision-language chat models; the
# deprecated `use_auth_token` in model_kwargs becomes `token` here.
@st.cache_resource
def load_pipeline():
    return pipeline(
        "image-text-to-text",
        model=model_id,
        token=HF_TOKEN,
        torch_dtype=torch.bfloat16,
        device_map="auto",  # requires the `accelerate` package
    )

pipe = load_pipeline()  # distinct name, so the `pipeline` factory isn't shadowed
# Streamlit UI
st.title("Multimodal LLM Inference")
st.write(f"**Using model:** {model_id}")

# Text input
input_text = st.text_input("Enter your prompt:")

# Image input
uploaded_file = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])
if st.button("Generate"):
    if input_text and uploaded_file:
        # Decode the upload; convert to RGB so PNGs with an alpha channel work
        image = Image.open(uploaded_file).convert("RGB")

        # Chat-style multimodal input. Llama 3.2 Vision does not accept a
        # system prompt alongside an image, so the instruction lives in the
        # user turn, with image and text passed as typed content items.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": input_text},
                ],
            }
        ]

        # Generate the response
        with st.spinner("Generating..."):
            response = pipe(text=messages, max_new_tokens=30)

        # The pipeline returns the whole chat; the last message is the
        # assistant's reply.
        st.write("Generated Response:")
        st.write(response[0]["generated_text"][-1]["content"])
    else:
        st.error("Please enter a prompt and upload an image.")
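# To run this locally (a sketch; the file name and the unpinned package list
# are assumptions):
#   pip install streamlit transformers torch accelerate pillow
#   streamlit run app.py
# The "image-text-to-text" task needs a recent transformers release (roughly
# v4.46+), and the meta-llama checkpoints are gated: the account behind
# HF_TOKEN must have been granted access to the model.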