Spaces: Running on Zero
import gradio as gr
import os
import torch
from transformers import AutoProcessor, MllamaForConditionalGeneration
from PIL import Image
import spaces
# Check if we're running in a Hugging Face Space and if ZeroGPU (SPACES_ZERO_GPU) is enabled
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None

# Determine the device (GPU if available, else CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")

# Get the Hugging Face token from the environment variables
HF_TOKEN = os.environ.get("HF_TOKEN")
# Load the model and processor
model_name = "ruslanmv/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    token=HF_TOKEN,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
    device_map="auto" if device == "cuda" else None,  # Use device mapping if CUDA is available
)
# With device_map="auto" the model is already placed on the GPU;
# an explicit move is only needed for the CPU fallback.
if device == "cpu":
    model.to(device)
processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)
# Request the free ZeroGPU device from Hugging Face Spaces for the duration of each call
@spaces.GPU
def predict(image, text):
    # Prepare the input messages
    messages = [
        {"role": "user", "content": [
            {"type": "image"},              # Specify that an image is provided
            {"type": "text", "text": text}  # Add the user-provided text input
        ]}
    ]
    # Create the input text using the processor's chat template
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Process the inputs and move them to the appropriate device
    # (add_special_tokens=False because the chat template already adds the special tokens)
    inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(device)
    # Generate a response from the model
    outputs = model.generate(**inputs, max_new_tokens=100)
    # Decode the output to return the final response
    response = processor.decode(outputs[0], skip_special_tokens=True)
    return response
# Define the Gradio interface
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Image Input"),   # Image input with label
        gr.Textbox(label="Text Input")               # Textbox input with label
    ],
    outputs=gr.Textbox(label="Generated Response"),  # Output with a more descriptive label
    title="Llama 3.2 11B Vision Instruct Demo",      # Title of the interface
    description="This demo uses Meta's Llama 3.2 11B Vision model to generate responses based on an image and text input.",  # Short description
theme="compact" # Using a compact theme for a cleaner look | |
) | |
# Launch the interface | |
interface.launch(debug=True) | |
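
# Deployment note (an assumption about the surrounding Space, not part of the original
# app code): device_map="auto" relies on the accelerate package, so a requirements.txt
# for this Space would need at least transformers, accelerate, torch, and Pillow on top
# of what the Gradio SDK image already provides.
#
# A quick local smoke test could look like this (a sketch; "example.jpg" is a hypothetical
# sample image placed next to the script, and @spaces.GPU is a no-op outside a Space):
#   from PIL import Image
#   print(predict(Image.open("example.jpg"), "Describe this image in one sentence."))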