Spaces:

Madhuri123
/

s2

Sleeping

File size: 1,380 Bytes

fa07468
aadd242
722c6e5
aadd242
b4c3b08
aadd242
 
fa07468
cb4d59e
538b49d
fa07468
aadd242
 
 
 
 
cb4d59e
722c6e5
aadd242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722c6e5
 
fa07468
cb4d59e
fa07468

import streamlit as st
from transformers import pipeline
from PIL import Image
import torch

# Hugging Face access token, read from Streamlit's secrets store.
HF_TOKEN = st.secrets["hf_token"]

# Checkpoint to load from the Hugging Face Hub.
# NOTE(review): chat-style `messages` input relies on the checkpoint shipping
# a chat template; the "-Instruct" variant of this model may be required --
# confirm against the model card before deploying.
model_id = "meta-llama/Llama-3.2-11B-Vision"


@st.cache_resource
def _load_generator(checkpoint: str, token: str):
    """Build the image+text generation pipeline once and reuse it.

    Streamlit re-executes this script on every widget interaction, so without
    caching the model would be rebuilt on each rerun.

    Args:
        checkpoint: Hub model id to load.
        token: Hugging Face access token used to download the checkpoint.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    return pipeline(
        "image-text-to-text",
        model=checkpoint,
        # `token` replaces the deprecated `use_auth_token` model kwarg.
        token=token,
        model_kwargs={"torch_dtype": torch.bfloat16},
    )


# Bound to a distinct name so the imported `pipeline` factory is not shadowed.
generator = _load_generator(model_id, HF_TOKEN)

# Streamlit UI
st.title("Multimodal LLM Inference")
st.write(f"**Using model:** {model_id}")

# Text input
input_text = st.text_input("Enter your prompt:")

# Image input
uploaded_file = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])

if st.button("Generate"):
    if input_text and uploaded_file:
        # Normalize to RGB so PNGs with an alpha channel or palette are
        # handed to the model in the same format as JPEGs.
        image = Image.open(uploaded_file).convert("RGB")

        # Chat-format input: one user turn whose content is a list of typed
        # parts (the image, then the prompt text).
        # NOTE(review): no system message is sent because the Llama 3.2
        # Vision chat template is understood to reject system prompts when
        # an image is present -- confirm against the model card.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": input_text},
                ],
            }
        ]

        # return_full_text=False makes `generated_text` just the newly
        # generated string instead of the whole conversation.
        with st.spinner("Generating..."):
            response = generator(
                text=messages,
                max_new_tokens=30,
                return_full_text=False,
            )

        # Display results
        st.write("Generated Response:")
        st.write(response[0]["generated_text"])
    else:
        st.error("Please enter a prompt and upload an image.")