sample script #1, opened by ctranslate2-4you
Is this basically correct?
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModel
def process_image(image_path):
    # Load the image; all further preprocessing is handled by the model's
    # built-in CLIPVisionTowerHD, so a PIL image is all we need to return.
    image = Image.open(image_path).convert('RGB')
    return image
def ask_about_image(image_path, question="What does this image depict?"):
    # Load model and tokenizer
    model_path = "infly/InfMLLM2_7B_chat"
    model = AutoModel.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16
    ).cuda().eval()
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True
    )

    # Load the image
    image = process_image(image_path)

    # Build the conversation history; the empty 'gpt' turn is left for the
    # model's answer.
    history = [
        {
            'from': 'human',
            'value': f"<|image|>{question}"
        },
        {
            'from': 'gpt',
            'value': ""
        }
    ]

    # Create the samples dict as shown in the model's source code
    samples = {
        'images': [image],           # the model processes the image internally
        'conversations': [history]
    }

    # Generate a response
    with torch.inference_mode():
        responses, _ = model.generate(
            samples=samples,
            max_length=512,
            num_beams=1,
            top_p=0.9,
            temperature=0.7,
            return_prompts=True
        )
    return responses[0]
if __name__ == "__main__":
    image_path = "path/to/your/image.jpg"
    response = ask_about_image(image_path)
    print("Model's response:", response)