import os
import argparse

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig
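
# Optional flag: load the pre-quantized 4-bit checkpoint instead of the full-precision model.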
parser = argparse.ArgumentParser(description="Caption images with a local Molmo model, optionally 4-bit quantized")
parser.add_argument("-q", "--use_quant", action="store_true", help="Use quantized model")
args = parser.parse_args()
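
# Report the available device; actual model placement is handled by device_map='auto'.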
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available. Using CUDA.")
else:
    device = torch.device("cpu")
    print("GPU is not available. Using CPU.")
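
# Load the Molmo processor from the local checkpoint directory.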
local_path = "./model/Molmo-7B-D-0924"
processor = AutoProcessor.from_pretrained(
    local_path,
    local_files_only=True,
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)
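
# Load either the pre-quantized 4-bit checkpoint or the full-precision model.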
if args.use_quant:
    quantized_local_path = "./model/molmo-7B-D-bnb-4bit"
    model = AutoModelForCausalLM.from_pretrained(
        quantized_local_path,
        trust_remote_code=True,
        torch_dtype='auto',
        device_map='auto',
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        local_path,
        trust_remote_code=True,
        torch_dtype='auto',
        device_map='auto',
    )
    # Cast only the full-precision model; a bnb 4-bit model cannot be re-cast after loading.
    model.to(dtype=torch.bfloat16)
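
# Directory of images to caption; generated captions are saved alongside them.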
image_directory = "./images"
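
# Caption every supported image in the directory.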
for filename in os.listdir(image_directory):
    if filename.lower().endswith((".jpg", ".jpeg", ".png")):
        image_path = os.path.join(image_directory, filename)
        # Convert to RGB so paletted or RGBA images do not break preprocessing.
        image = Image.open(image_path).convert("RGB")
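
        # Preprocess the image and the captioning prompt into model inputs.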
        inputs = processor.process(
            images=[image],
            text="Describe what you see in vivid detail, without line breaks. Include information about the pose of characters, their facial expression, their height, body type, weight, the position of their limbs, and the direction of their gaze, the color of their eyes, hair, and skin. If you know a person or place name, provide it. If you know the name of an artist who may have created what you see, provide that. Do not provide opinions or value judgements. Limit your response to 276 words to avoid your description getting cut off.",
        )
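
        # Move inputs to the model's device and add a batch dimension.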
        inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
        inputs["images"] = inputs["images"].to(torch.bfloat16)
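
        # Generate the caption under bfloat16 autocast, stopping at the end-of-text marker.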
        with torch.autocast(device_type="cuda", enabled=True, dtype=torch.bfloat16):
            output = model.generate_from_batch(
                inputs,
                GenerationConfig(max_new_tokens=500, stop_strings="<|endoftext|>"),
                tokenizer=processor.tokenizer,
            )
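
        # Drop the prompt tokens and decode only the newly generated ones.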
        generated_tokens = output[0, inputs["input_ids"].size(1):]
        generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
print("Caption for: ", filename) |
|
print(generated_text) |
|
|
|
print("*---------------------------------------------------*") |
|
|
|
|
|
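
        # Write the caption to a .txt file with the same base name as the image.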
        output_filename = os.path.splitext(filename)[0] + ".txt"
        with open(os.path.join(image_directory, output_filename), "w") as file:
            file.write(generated_text)