# LlavaMistral1 / multi_script.py
# lorocksUMD's picture
# Update multi_script.py
# 7fe1960 verified
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer
from llava.model.language_model.llava_mistral import LlavaMistralForCausalLM
from llava.model.builder import load_pretrained_model
from llava.mm_utils import (
process_images,
tokenizer_image_token,
get_model_name_from_path,
)
from llava.constants import (
IMAGE_TOKEN_INDEX,
DEFAULT_IMAGE_TOKEN,
DEFAULT_IM_START_TOKEN,
DEFAULT_IM_END_TOKEN,
IMAGE_PLACEHOLDER,
)
from llava.conversation import conv_templates, SeparatorStyle
import argparse
import torch
import requests
from PIL import Image
from io import BytesIO
import re
# Command-line interface for the script. Parsing happens at import time, so
# the module is meant to be executed directly rather than imported.
parser = argparse.ArgumentParser(
    description="Run one LLaVA prompt against one or more images."
)
parser.add_argument(
    "--model-path",
    type=str,
    default="liuhaotian/llava-v1.6-mistral-7b",
    help="Model path or hub id handed to load_pretrained_model.",
)
parser.add_argument(
    "--image-file",
    type=str,
    required=True,
    help=(
        "Image path or URL, a comma-separated list of them, "
        "or a directory containing .png/.jpg files."
    ),
)
parser.add_argument(
    "--inference-type",
    type=str,
    default="auto",
    help='Value forwarded as device_map when loading the model, e.g. "auto" or "cpu".',
)
parser.add_argument(
    "--prompt",
    type=str,
    default="Explain this image",
    help="Question to ask about the image(s).",
)
cmd_args = parser.parse_args()

# `device` is passed to the model loader as its device_map argument; tensors
# fed to the model later on must live on the same device as the model.
device = cmd_args.inference_type
prompt = cmd_args.prompt
image_file = cmd_args.image_file
model_path = cmd_args.model_path
# Functions for inference
def image_parser(args):
    """Split ``args.image_file`` into individual image locations.

    The string is split on ``args.sep``. Whitespace around each entry is
    trimmed and empty entries (for example the one produced by a trailing
    separator) are dropped, so every returned item is a usable path or URL.

    Args:
        args: Object exposing ``image_file`` (str) and ``sep`` (str).

    Returns:
        A list of non-empty, stripped entries in their original order.
    """
    pieces = (piece.strip() for piece in args.image_file.split(args.sep))
    return [piece for piece in pieces if piece]
def load_image(image_file):
    """Load one image from a local path or an HTTP(S) URL as an RGB image.

    Args:
        image_file: Filesystem path, or a URL beginning with ``http://`` or
            ``https://`` (scheme matched case-insensitively).

    Returns:
        The image converted to RGB mode.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
        OSError: If the file is missing or the data is not a readable image.
    """
    # Match the full scheme so a local file whose name merely starts with
    # "http" is not mistaken for a URL.
    if image_file.lower().startswith(("http://", "https://")):
        # A timeout keeps a stalled server from hanging the script forever.
        response = requests.get(image_file, timeout=30)
        # Fail on 4xx/5xx instead of trying to decode an error page as an image.
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_file

    # convert() returns an independent, fully loaded image, so the handle
    # opened by Image.open can be released as soon as the conversion is done.
    with Image.open(source) as image:
        return image.convert("RGB")
def load_images(image_files):
    """Return the images for *image_files*, loaded one by one with load_image.

    The result has one entry per input location, in the same order.
    """
    return [load_image(location) for location in image_files]
# Derive the model name from the path that is actually loaded. The name is
# passed to the loader and also drives the conversation-template choice
# further down, so it has to follow --model-path rather than a fixed string.
# For the default path this yields the same name as before.
model_name = get_model_name_from_path(model_path)

# Settings bundle read by the rest of the script (prompt building, image
# parsing and generation). temperature=0 makes the generate call run with
# do_sample disabled.
args = argparse.Namespace(
    model_path=model_path,
    model_base=None,
    model_name=model_name,
    query=prompt,
    conv_mode=None,  # filled in once the conversation mode has been inferred
    image_file=image_file,
    sep=",",  # separator between several entries in --image-file
    temperature=0,
    top_p=None,
    num_beams=1,
    max_new_tokens=512,
)

# Load tokenizer, model and image processor; `device` (from --inference-type)
# is forwarded as device_map.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path, None, model_name, device_map=device
)
# --- Build the user turn: query text plus the image marker ------------------
qs = args.query
image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN

# The marker is either the bare image token or the token wrapped in
# start/end tokens, depending on the model configuration.
_image_marker = (
    image_token_se if model.config.mm_use_im_start_end else DEFAULT_IMAGE_TOKEN
)
if IMAGE_PLACEHOLDER in qs:
    # The query positions the image itself: substitute every placeholder.
    qs = re.sub(IMAGE_PLACEHOLDER, _image_marker, qs)
else:
    # No placeholder: put the marker on its own line before the query.
    qs = _image_marker + "\n" + qs

# --- Infer the conversation template from the model name --------------------
# Ordered rules: the first fragment found in the lower-cased name wins.
_CONV_MODE_RULES = (
    ("llama-2", "llava_llama_2"),
    ("mistral", "mistral_instruct"),
    ("v1.6-34b", "chatml_direct"),
    ("v1", "llava_v1"),
    ("mpt", "mpt"),
)
_lowered_name = model_name.lower()
conv_mode = next(
    (mode for fragment, mode in _CONV_MODE_RULES if fragment in _lowered_name),
    "llava_v0",
)

if args.conv_mode is None or conv_mode == args.conv_mode:
    args.conv_mode = conv_mode
else:
    # An explicitly requested mode is kept; only report the disagreement.
    print(
        "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
            conv_mode, args.conv_mode, args.conv_mode
        )
    )

# --- Render the final prompt -------------------------------------------------
conv = conv_templates[args.conv_mode].copy()
_user_role, _assistant_role = conv.roles[0], conv.roles[1]
conv.append_message(_user_role, qs)
conv.append_message(_assistant_role, None)  # open slot for the model's answer
prompt = conv.get_prompt()
import glob
import os

# Resolve --image-file into a list of image locations. A directory is
# detected by asking the filesystem rather than by looking for ".jpg"/".png"
# inside the string, so URLs or files with other extensions are accepted and
# directory names that happen to contain ".jpg" are still scanned.
if os.path.isdir(image_file):
    image_ext = ("*.png", "*.jpg")
    image_files = []
    for ext in image_ext:
        # glob returns entries in arbitrary order; sort for reproducible runs.
        image_files.extend(sorted(glob.glob(os.path.join(image_file, ext))))
else:
    image_files = image_parser(args)

# Fail early with a clear message instead of an opaque error downstream.
if not image_files:
    raise FileNotFoundError(
        "No images found for --image-file {!r}".format(image_file)
    )

images = load_images(image_files)
image_sizes = [x.size for x in images]

# NOTE(review): the prompt built earlier prepends a single image token unless
# the query carries its own placeholders; confirm how the model treats any
# additional images supplied here.
images_tensor = process_images(images, image_processor, model.config)
if isinstance(images_tensor, list):
    # NOTE(review): LLaVA's process_images appears to return a list instead of
    # a stacked tensor when per-image shapes differ; confirm against the
    # installed llava version. Each entry is moved individually.
    images_tensor = [
        tensor.to(model.device, dtype=torch.float16) for tensor in images_tensor
    ]
else:
    images_tensor = images_tensor.to(model.device, dtype=torch.float16)
# Tokenise the prompt, add a batch dimension and place the ids on the model's
# device, the same device the image tensor is moved to. This works for CPU and
# GPU alike, with no manual `.cuda()` toggle.
input_ids = (
    tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
    .unsqueeze(0)
    .to(model.device)
)

# Inference only: no autograd bookkeeping is needed.
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=images_tensor,
        image_sizes=image_sizes,
        # Sample only for a positive temperature.
        do_sample=args.temperature > 0,
        temperature=args.temperature,
        top_p=args.top_p,
        num_beams=args.num_beams,
        max_new_tokens=args.max_new_tokens,
        use_cache=True,
    )

# Decode the first (only) sequence of the batch and trim surrounding whitespace.
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
# Print a short, dataset-specific header line followed by the model's answer.
# Words are whitespace-separated tokens; the "dataset1" header previously
# printed len(outputs), which is the number of characters, not words.
_word_count = len(outputs.split())
if "dataset1" in image_file:
    print("Num of words: ", _word_count)
elif "dataset2" in image_file:
    print()
else:
    print("Is single word?", _word_count == 1)
print(outputs)
# End Llava inference