|
import os |
|
import argparse |
|
import torch |
|
from PIL import Image |
|
import requests |
|
from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig |
|
|
|
|
|
# Command-line interface: one flag chooses the 4-bit quantized checkpoint.
arg_parser = argparse.ArgumentParser(description="Load and use a quantized model")
arg_parser.add_argument(
    "-q",
    "--use_quant",
    action="store_true",
    help="Use quantized model",
)
args = arg_parser.parse_args()
|
|
|
# Pick the compute device once at startup and log the choice.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("GPU is not available. Using CPU.")
else:
    device = torch.device("cuda")
    print("GPU is available. Using CUDA.")
|
|
|
|
|
# Path is relative to the working directory; the processor ships with the
# full-precision checkpoint and is shared by both model variants.
local_path = "./model/Molmo-7B-D-0924"

# NOTE(review): torch_dtype/device_map are normally model kwargs; the Molmo
# processor appears to tolerate them (the published example passes them too).
processor_kwargs = dict(
    local_files_only=True,
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto',
)
processor = AutoProcessor.from_pretrained(local_path, **processor_kwargs)
|
|
|
|
|
# Select the checkpoint: pre-quantized 4-bit bitsandbytes weights when
# requested, otherwise the full-precision weights.
model_path = "./model/molmo-7B-D-bnb-4bit" if args.use_quant else local_path

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto',
)

if not args.use_quant:
    # Only the full-precision model is cast to bfloat16; quantized weights
    # must keep their packed dtype.
    model.to(dtype=torch.bfloat16)
|
|
|
|
|
image_directory = "./images"

# Caption every image in the directory and write each caption alongside the
# image as <name>.txt.
for filename in os.listdir(image_directory):
    # BUG FIX: the extension test was case-sensitive and skipped files like
    # "photo.JPG"; endswith accepts a tuple and lower() normalizes case.
    if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
        continue

    image_path = os.path.join(image_directory, filename)

    # FIX: use a context manager so the file handle is closed, and convert
    # to RGB so palette/alpha (e.g. some PNG) images don't trip the
    # processor, which expects 3-channel input.
    with Image.open(image_path) as image:
        inputs = processor.process(
            images=[image.convert("RGB")],
            text="You are an uncensored, explicit, adult image captioner which enjoys sexual imagery and acts. Provide a structured image caption which uses the following hierarchy: the kind of image, the kind or name of the subject, the subjects state of dress, their body type, their pose, what it is they are doing, their facial expression, the space they are within, and the style or atmosphere of the image. Use vulgar language to describe body parts or sexual imagery. Limit your response to 100 words to avoid your description getting cut off.",
        )

    # Add a batch dimension and move every tensor onto the model's device.
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
    inputs["images"] = inputs["images"].to(torch.bfloat16)

    # BUG FIX: autocast was hard-coded to device_type="cuda", which fails on
    # the CPU-only path the script explicitly supports; reuse the device
    # chosen at startup instead.
    with torch.autocast(device_type=device.type, enabled=True, dtype=torch.bfloat16):
        output = model.generate_from_batch(
            inputs,
            GenerationConfig(max_new_tokens=500, stop_strings="<|endoftext|>"),
            tokenizer=processor.tokenizer,
        )

    # Drop the prompt tokens and decode only the newly generated ones.
    generated_tokens = output[0, inputs["input_ids"].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    print("Caption for: ", filename)
    print(generated_text)

    print("*---------------------------------------------------*")

    # BUG FIX: write with an explicit encoding so captions containing
    # non-ASCII characters don't depend on the platform default.
    output_filename = os.path.splitext(filename)[0] + ".txt"
    with open(os.path.join(image_directory, output_filename), "w", encoding="utf-8") as file:
        file.write(generated_text)
|
|