import os

import streamlit as st
import torch
from huggingface_hub import login
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

# Step 1: Log in to Hugging Face with your access token from secrets
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")  # Fetch the token from the environment
if huggingface_token:
    login(token=huggingface_token)  # Authenticate using the token
else:
    st.error("Hugging Face token not found. Please set it in the Secrets section.")

# Step 2: Load the model and processor.
# Cached with st.cache_resource so the 11B model is not reloaded on every Streamlit rerun.
@st.cache_resource
def load_model_and_processor():
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    model = MllamaForConditionalGeneration.from_pretrained(
        model_name,
        token=huggingface_token,  # "token" replaces the deprecated "use_auth_token"
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(
        model_name,
        token=huggingface_token,
    )
    return model, processor

try:
    model, processor = load_model_and_processor()
    st.success("Model and processor loaded successfully!")
except Exception as e:
    st.error(f"Error loading model or processor: {str(e)}")

# Step 3: Create a simple Streamlit app
def main():
    st.title("Llama 3.2 11B Vision Model")
    st.write("Upload an image and enter a prompt to generate output.")

    # Upload an image and collect the prompt
    image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    prompt = st.text_area("Enter your prompt here:")

    if st.button("Generate Output"):
        if image_file and prompt:
            # Load and display the image
            image = Image.open(image_file).convert("RGB")
            st.image(image, caption="Uploaded Image", use_column_width=True)

            try:
                # Prepare the messages in the format expected by the processor
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image"},
                        ],
                    }
                ]

                # Apply the chat template to build the model prompt
                input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

                # Prepare inputs and move them to the device the model was loaded on
                inputs = processor(
                    text=input_text,
                    images=[image],
                    return_tensors="pt",
                ).to(model.device)

                # Generate output
                with torch.no_grad():
                    output_ids = model.generate(
                        **inputs,
                        max_new_tokens=250,
                    )

                # Decode only the newly generated tokens. Slicing off the prompt tokens is
                # more reliable than string-replacing the prompt text, because the templated
                # prompt contains special tokens that skip_special_tokens strips from the
                # decoded output.
                generated_ids = output_ids[:, inputs["input_ids"].shape[-1]:]
                generated_output = processor.batch_decode(
                    generated_ids, skip_special_tokens=True
                )[0].strip()

                st.write("Generated Output:", generated_output)
            except Exception as e:
                st.error(f"Error during prediction: {str(e)}")
        else:
            st.warning("Please upload an image and enter a prompt.")

if __name__ == "__main__":
    main()
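# Usage note (assumptions: the file is saved as app.py and streamlit, torch,
# transformers, pillow, and huggingface_hub are installed):
#
#     streamlit run app.py
#
# HUGGINGFACE_TOKEN must be available as an environment variable (for example via the
# Secrets section of a Hugging Face Space, which exposes secrets to the environment),
# and the account behind the token must have been granted access to the gated
# meta-llama/Llama-3.2-11B-Vision-Instruct repository.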