import streamlit as st
from transformers import pipeline
from PIL import Image
import torch
# Load the Hugging Face access token from Streamlit secrets
HF_TOKEN = st.secrets["hf_token"]
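# st.secrets reads from .streamlit/secrets.toml locally, or from the Space's
# secrets settings when deployed. A minimal secrets file would look like the
# sketch below (the key name "hf_token" is just a choice; it has to match the
# lookup above):
#
#   # .streamlit/secrets.toml
#   hf_token = "hf_..."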
# Gated Llama 3.2 Vision checkpoint; the -Instruct variant is tuned for the
# chat-style messages used below (the base Vision model is not).
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Initialize the pipeline once and cache it so Streamlit does not reload the
# 11B model (roughly 22 GB in bfloat16) on every rerun. "image-text-to-text"
# is the actual transformers task for vision-language chat models; the
# deprecated `use_auth_token` in model_kwargs becomes `token` here.
@st.cache_resource
def load_pipeline():
    return pipeline(
        "image-text-to-text",
        model=model_id,
        token=HF_TOKEN,
        torch_dtype=torch.bfloat16,
        device_map="auto",  # requires the `accelerate` package
    )

pipe = load_pipeline()  # distinct name, so the `pipeline` factory isn't shadowed
# Streamlit UI
st.title("Multimodal LLM Inference")
st.write(f"**Using model:** {model_id}")

# Text input
input_text = st.text_input("Enter your prompt:")

# Image input
uploaded_file = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])
if st.button("Generate"):
    if input_text and uploaded_file:
        # Decode the upload; convert to RGB so PNGs with an alpha channel work
        image = Image.open(uploaded_file).convert("RGB")

        # Chat-style multimodal input. Llama 3.2 Vision does not accept a
        # system prompt alongside an image, so the instruction lives in the
        # user turn, with image and text passed as typed content items.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": input_text},
                ],
            }
        ]

        # Generate the response
        with st.spinner("Generating..."):
            response = pipe(text=messages, max_new_tokens=30)

        # The pipeline returns the whole chat; the last message is the
        # assistant's reply.
        st.write("Generated Response:")
        st.write(response[0]["generated_text"][-1]["content"])
    else:
        st.error("Please enter a prompt and upload an image.")
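# To run this locally (a sketch; the file name and the unpinned package list
# are assumptions):
#   pip install streamlit transformers torch accelerate pillow
#   streamlit run app.py
# The "image-text-to-text" task needs a recent transformers release (roughly
# v4.46+), and the meta-llama checkpoints are gated: the account behind
# HF_TOKEN must have been granted access to the model.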