Spaces:

VinitT
/

Llama-3.2-11B-Vision-Instruct

Running

App Files Community

VinitT

diabolic6045 committed on Oct 11

Commit

8f9661d

•

1 Parent(s): 52846d8

Update app.py (#1)

Browse files

- Update app.py (65316115944ac1ab429fc17bbf8058204d0cc683)

Co-authored-by: Divax Shah <diabolic6045@users.noreply.huggingface.co>

Files changed (1) hide show

app.py +56 -32

app.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import os
 import streamlit as st
 from huggingface_hub import login
-from transformers import AutoModelForCausalLM, AutoTokenizer
 from PIL import Image
-import requests
 import torch
 # Step 1: Log in to Hugging Face with your access token from secrets
@@ -13,57 +12,82 @@ if huggingface_token:
 else:
     st.error("Hugging Face token not found. Please set it in the Secrets section.")
-# Step 2: Load the model and tokenizer
-model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"  # Adjust if needed
 try:
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-    st.success("Model loaded successfully!")
 except Exception as e:
-    st.error(f"Error loading model: {str(e)}")
 # Step 3: Create a simple Streamlit app
 def main():
     st.title("Llama 3.2 11B Vision Model")
     st.write("Upload an image and enter a prompt to generate output.")
     # Upload image
     image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
     prompt = st.text_area("Enter your prompt here:")
     if st.button("Generate Output"):
         if image_file and prompt:
             # Load image
-            image = Image.open(image_file)
             st.image(image, caption="Uploaded Image", use_column_width=True)
-            # Preprocess the image if needed (convert to tensor, etc.)
-            # This depends on how the model expects the image input
-            # Example of converting image to a format suitable for the model
-            # Note: Adjust this part based on your model's requirements.
-            # Here, we're just using a placeholder for the model input.
-            # You might need to resize or normalize the image based on the model's requirements.
-            # For example:
-            # image_tensor = preprocess_image(image)
             try:
-                # Prepare the input for the model
-                inputs = tokenizer(prompt, return_tensors='pt')
-                # Perform inference
-                # Adjust the input format for the model accordingly
-                # Here we assume the model takes a prompt and an image (adjust as necessary)
                 with torch.no_grad():
-                    model_output = model.generate(**inputs)  # Pass image tensor if required
                 # Decode the output
-                output_text = tokenizer.decode(model_output[0], skip_special_tokens=True)
-                st.write("Generated Output:", output_text)
             except Exception as e:
                 st.error(f"Error during prediction: {str(e)}")
         else:
             st.warning("Please upload an image and enter a prompt.")
 if __name__ == "__main__":
-    main()

 import os
 import streamlit as st
 from huggingface_hub import login
+from transformers import MllamaForConditionalGeneration, AutoProcessor
 from PIL import Image
 import torch
 # Step 1: Log in to Hugging Face with your access token from secrets
 else:
     st.error("Hugging Face token not found. Please set it in the Secrets section.")
+# Step 2: Load the model and processor
 try:
+    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+    model = MllamaForConditionalGeneration.from_pretrained(
+        model_name,
+        use_auth_token=huggingface_token,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+    )
+    processor = AutoProcessor.from_pretrained(
+        model_name,
+        use_auth_token=huggingface_token,
+    )
+    st.success("Model and processor loaded successfully!")
 except Exception as e:
+    st.error(f"Error loading model or processor: {str(e)}")
 # Step 3: Create a simple Streamlit app
 def main():
     st.title("Llama 3.2 11B Vision Model")
     st.write("Upload an image and enter a prompt to generate output.")
     # Upload image
     image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
     prompt = st.text_area("Enter your prompt here:")
     if st.button("Generate Output"):
         if image_file and prompt:
             # Load image
+            image = Image.open(image_file).convert("RGB")
             st.image(image, caption="Uploaded Image", use_column_width=True)
             try:
+                # Prepare the messages in the format expected by the processor
+                messages = [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": prompt},
+                            {"type": "image"}
+                        ]
+                    }
+                ]
+                # Apply chat template
+                input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+                # Prepare inputs for the model
+                inputs = processor(
+                    text=input_text,
+                    images=[image],
+                    return_tensors="pt"
+                ).to("cuda" if torch.cuda.is_available() else "cpu")
+                # Generate output
                 with torch.no_grad():
+                    output_ids = model.generate(
+                        **inputs,
+                        max_new_tokens=250,
+                    )
                 # Decode the output
+                output_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
+                # Extract the generated response
+                # Remove the prompt part from the output_text
+                if input_text in output_text:
+                    generated_output = output_text.replace(input_text, "").strip()
+                else:
+                    generated_output = output_text.strip()
+                st.write("Generated Output:", generated_output)
             except Exception as e:
                 st.error(f"Error during prediction: {str(e)}")
         else:
             st.warning("Please upload an image and enter a prompt.")
 if __name__ == "__main__":
+    main()