import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Define the model ID and load the model and processor
model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"


def load_model():
    """Loads the Llama 3.2-90B Vision-Instruct model and processor."""
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor


def process_image(url):
    """Processes the image from the given URL."""
    image = Image.open(requests.get(url, stream=True).raw)
    return image


def generate_response(model, processor, image, prompt):
    """Generates a text response based on the image and the prompt."""
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": prompt}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=30)
    return processor.decode(output[0])


def main():
    # Load model and processor
    model, processor = load_model()

    # Sample image URL
    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
    image = process_image(url)

    # Define a sample prompt
    prompt = "If I had to write a haiku for this one, it would be:"

    # Generate response
    response = generate_response(model, processor, image, prompt)
    print(response)


if __name__ == "__main__":
    main()