sample script #1, opened by ctranslate2-4you
Is this basically correct?
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModel
def process_image(image_path):
    # Load the image; all further preprocessing is handled by the model's
    # built-in CLIPVisionTowerHD, so a PIL image is all we need to return.
    image = Image.open(image_path).convert('RGB')
    return image
def ask_about_image(image_path, question="What does this image depict?"):
    # Load model and tokenizer
    model_path = "infly/InfMLLM2_7B_chat"
    model = AutoModel.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16
    ).cuda().eval()
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True
    )

    # Load the image
    image = process_image(image_path)

    # Build the conversation history; the empty 'gpt' turn is left for the
    # model's answer.
    history = [
        {
            'from': 'human',
            'value': f"<|image|>{question}"
        },
        {
            'from': 'gpt',
            'value': ""
        }
    ]

    # Create the samples dict as shown in the model's source code
    samples = {
        'images': [image],           # the model processes the image internally
        'conversations': [history]
    }

    # Generate a response
    with torch.inference_mode():
        responses, _ = model.generate(
            samples=samples,
            max_length=512,
            num_beams=1,
            top_p=0.9,
            temperature=0.7,
            return_prompts=True
        )
    return responses[0]
if __name__ == "__main__":
    image_path = "path/to/your/image.jpg"
    response = ask_about_image(image_path)
    print("Model's response:", response)