import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Define the model ID and load the model and processor
model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"


def load_model():
    """Loads the Llama 3.2-90B Vision-Instruct model and processor."""
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor


def process_image(url):
    """Processes the image from the given URL."""
    image = Image.open(requests.get(url, stream=True).raw)
    return image


def generate_response(model, processor, image, prompt):
    """Generates a text response based on the image and the prompt."""
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": prompt}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=30)
    return processor.decode(output[0])


def main():
    # Load model and processor
    model, processor = load_model()

    # Sample image URL
    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
    image = process_image(url)

    # Define a sample prompt
    prompt = "If I had to write a haiku for this one, it would be:"

    # Generate response
    response = generate_response(model, processor, image, prompt)
    print(response)


if __name__ == "__main__":
    main()