Madhuri123 committed
Commit aadd242 · verified · 1 Parent(s): f67dd23

Update app.py

Files changed (1)
  1. app.py +38 -22
app.py CHANGED
@@ -1,36 +1,52 @@
  import streamlit as st
- import transformers
- import torch
- import requests
+ from transformers import pipeline
  from PIL import Image
- from transformers import MllamaForConditionalGeneration, AutoProcessor
- import subprocess
- subprocess.run(["pip", "install", "accelerate>=0.26.0"])
+ import torch

- HF_TOKEN=st.secrets["hf_token"]
+ # Load Hugging Face token
+ HF_TOKEN = st.secrets["hf_token"]

  # Load the model and pipeline
  model_id = "meta-llama/Llama-3.2-11B-Vision"
- # Streamlit user interface
- st.title("LLM Model Inference")
- st.write(f"**Using model:** {model_id}")

- # Set up the pipeline with the Hugging Face token
- model = MllamaForConditionalGeneration.from_pretrained(
-     model_id,
-     torch_dtype=torch.bfloat16,
-     device_map="auto",
+ # Initialize pipeline
+ pipeline = pipeline(
+     "text-to-image-and-text", # Hypothetical task name for multimodal processing
+     model=model_id,
+     model_kwargs={"torch_dtype": torch.bfloat16, "use_auth_token": HF_TOKEN}
  )
- processor = AutoProcessor.from_pretrained(model_id)

- url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
- image = Image.open(requests.get(url, stream=True).raw)
+ # Streamlit UI
+ st.title("Multimodal LLM Inference")
+ st.write(f"**Using model:** {model_id}")
+
+ # Text Input
+ input_text = st.text_input("Enter your prompt:")
+
+ # Image Input
+ uploaded_file = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])

- prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
- inputs = processor(image, prompt, return_tensors="pt").to(model.device)
+ if st.button("Generate"):
+     if input_text and uploaded_file:
+         # Process image
+         image = Image.open(uploaded_file)
+
+         # Prepare multimodal input
+         messages = [
+             {"role": "system", "content": "You are a multimodal assistant."},
+             {"role": "user", "content": input_text, "image": image}
+         ]

- output = model.generate(**inputs, max_new_tokens=30)
- st.write(processor.decode(output[0]))
+         # Generate response
+         response = pipeline(messages, max_new_tokens=30)
+
+         # Display results
+         st.write("Generated Response:")
+         st.write(response[0]['generated_text'][-1]['content']) # Assuming this structure
+     else:
+         st.error("Please enter a prompt and upload an image.")
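
Note: the pipeline call introduced here relies on a task name the code itself marks as hypothetical; `transformers.pipeline` has no "text-to-image-and-text" task, the assignment `pipeline = pipeline(...)` shadows the imported function, and `use_auth_token` has been superseded by `token`. Below is a minimal sketch of the same Streamlit flow using the `MllamaForConditionalGeneration`/`AutoProcessor` approach from the removed revision; the `load_model` caching helper and the prompt template are illustrative assumptions, not part of this commit.

import streamlit as st
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

model_id = "meta-llama/Llama-3.2-11B-Vision"
HF_TOKEN = st.secrets["hf_token"]

@st.cache_resource  # load the checkpoint once per session instead of on every rerun
def load_model():
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=HF_TOKEN,  # `token` replaces the deprecated `use_auth_token`
    )
    processor = AutoProcessor.from_pretrained(model_id, token=HF_TOKEN)
    return model, processor

model, processor = load_model()

st.title("Multimodal LLM Inference")
st.write(f"**Using model:** {model_id}")

input_text = st.text_input("Enter your prompt:")
uploaded_file = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])

if st.button("Generate"):
    if input_text and uploaded_file:
        image = Image.open(uploaded_file)
        # Base (non-Instruct) checkpoint: the image token precedes the text prompt
        prompt = f"<|image|><|begin_of_text|>{input_text}"
        inputs = processor(image, prompt, return_tensors="pt").to(model.device)
        output = model.generate(**inputs, max_new_tokens=30)
        st.write(processor.decode(output[0], skip_special_tokens=True))
    else:
        st.error("Please enter a prompt and upload an image.")

Caching the model with `st.cache_resource` matters in Streamlit: the script re-executes on every widget interaction, so the committed version would otherwise rebuild the 11B pipeline on each rerun.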