import os

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

# Check whether we're running in a Hugging Face Space and whether ZeroGPU is enabled
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None

# Determine the device (GPU if available, else CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")

# Get the Hugging Face token from the environment
HF_TOKEN = os.environ.get("HF_TOKEN")

# Load the model and processor
model_name = "ruslanmv/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    token=HF_TOKEN,  # `use_auth_token` is deprecated in recent transformers releases
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
    device_map="auto" if device == "cuda" else None,  # Let accelerate handle placement on CUDA
)
# Only move the model manually when device_map isn't handling placement;
# calling .to() on a model loaded with device_map="auto" raises an error.
if device != "cuda":
    model.to(device)

processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)


@spaces.GPU  # Use the free GPU provided by Hugging Face Spaces (ZeroGPU)
def predict(image, text):
    # Build the chat-style input: one user turn with an image and a text prompt
    messages = [
        {"role": "user", "content": [
            {"type": "image"},               # Placeholder marking where the image goes
            {"type": "text", "text": text},  # The user-provided text prompt
        ]}
    ]

    # Render the messages with the processor's chat template,
    # appending the assistant generation prompt
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    # Process the image/text pair and move the tensors to the model's device;
    # the template above already added the special tokens
    inputs = processor(
        image, input_text, add_special_tokens=False, return_tensors="pt"
    ).to(model.device)

    # Generate a response from the model
    outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode only the newly generated tokens so the prompt isn't echoed back
    generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    response = processor.decode(generated_tokens, skip_special_tokens=True)
    return response


# Define the Gradio interface
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Image Input"),  # Image input with label
        gr.Textbox(label="Text Input"),             # Textbox input with label
    ],
    outputs=gr.Textbox(label="Generated Response"),
    title="Llama 3.2 11B Vision Instruct Demo",
    description="This demo uses Meta's Llama 3.2 11B Vision model to generate responses based on an image and text input.",
    theme="compact",  # Compact theme for a cleaner look
)

# Launch the interface
interface.launch(debug=True)
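
# Usage (a minimal sketch, assuming this script is saved as app.py and run
# locally rather than inside a Space; the dependency list below is an
# assumption, though `device_map="auto"` does require accelerate):
#   pip install gradio spaces torch transformers accelerate pillow
#   HF_TOKEN=<your-token> python app.py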