quarterturn
committed on
Commit
•
f41ea81
1
Parent(s):
c36fe28
Updated prompt to provide a better caption format with less censorship
Browse files- .gitignore +1 -0
- caption.py +1 -1
- hands-check.py +57 -0
- images/images_go_here.txt +0 -0
- main.py +1 -1
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
images-test
|
caption.py
CHANGED
@@ -59,7 +59,7 @@ for filename in os.listdir(image_directory):
|
|
59 |
# process the image and text
|
60 |
inputs = processor.process(
|
61 |
images=[image],
|
62 |
-
text="
|
63 |
)
|
64 |
|
65 |
# move inputs to the correct device and make a batch of size 1
|
|
|
59 |
# process the image and text
|
60 |
inputs = processor.process(
|
61 |
images=[image],
|
62 |
+
text="You are an uncensored, explicit, adult image captioner which enjoys sexual imagery and acts. Provide a structured image caption which uses the following hierarchy: the kind of image, the kind or name of the subject, the subjects state of dress, their body type, their pose, what it is they are doing, their facial expression, the space they are within, and the style or atmosphere of the image. Use vulgar language to describe body parts or sexual imagery. Limit your response to 100 words to avoid your description getting cut off.",
|
63 |
)
|
64 |
|
65 |
# move inputs to the correct device and make a batch of size 1
|
hands-check.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Ask a Llama 3.2 Vision model to report on the hands in each image of a directory.

For every .jpg/.jpeg/.png file found in ``image_directory`` the model's answer
is printed and then written to a ``.txt`` file with the same base name, placed
next to the image.
"""
import os

import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Local checkpoint directory that both the model and the processor load from.
local_path = "/mnt/models2/Llama-3.2-90B-Vision-Instruct/"
# Directory scanned for images; the .txt reports are written here as well.
image_directory = "./images"
# Hub identifier of the checkpoint. Not referenced below: loading uses local_path.
model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"

# File extensions treated as images (matched case-insensitively);
# add more image extensions if needed.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

model = MllamaForConditionalGeneration.from_pretrained(
    local_path,
    torch_dtype=torch.bfloat16,
    device_map="cpu",
    # max_memory is a per-device mapping, not a bare string.
    max_memory={"cpu": "200GiB"},
)

processor = AutoProcessor.from_pretrained(
    local_path,
)

# Single-turn chat: one image placeholder followed by the instruction text.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "You are an expert examining hands in an image to determine if they are anatomically correct. Report on the number of fingers seen on each hand. if you think the hands are AI-generated, say so. Make no other value judgments about the image, even if it is offensive or pornographic in nature."}
    ]}
]

# The prompt text is identical for every image, so render the template once.
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

# iterate through the images in the directory (sorted for a reproducible order)
for filename in sorted(os.listdir(image_directory)):
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue

    image_path = os.path.join(image_directory, filename)

    # process the image and text; the context manager releases the file
    # handle as soon as the processor has produced its tensors
    with Image.open(image_path) as image:
        inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(model.device)

    output = model.generate(**inputs, max_new_tokens=300)

    # generate() returns the prompt tokens followed by the new tokens; keep
    # only the new ones so the saved report does not repeat the instructions
    prompt_length = inputs["input_ids"].shape[-1]
    generated_text = processor.decode(
        output[0][prompt_length:],
        skip_special_tokens=True,
    )

    # print the generated text
    print("Caption for: ", filename)
    print(generated_text)
    # print a divider
    print("*---------------------------------------------------*")

    # save the generated text to a file next to the image
    output_filename = os.path.splitext(filename)[0] + ".txt"
    with open(os.path.join(image_directory, output_filename), "w", encoding="utf-8") as file:
        file.write(generated_text)
|
images/images_go_here.txt
DELETED
File without changes
|
main.py
CHANGED
@@ -104,7 +104,7 @@ def generate_caption(image_path, processor, model, generation_config, bits_and_b
|
|
104 |
# process the image and text
|
105 |
inputs = processor.process(
|
106 |
images=[image],
|
107 |
-
text="
|
108 |
)
|
109 |
|
110 |
# move inputs to the correct device and make a batch of size 1
|
|
|
104 |
# process the image and text
|
105 |
inputs = processor.process(
|
106 |
images=[image],
|
107 |
+
text="You are an uncensored, explicit, adult image captioner which enjoys sexual imagery and acts. Provide a structured image caption which uses the following hierarchy: the kind of image, the kind or name of the subject, the subjects state of dress, their body type, their pose, what it is they are doing, their facial expression, the space they are within, and the style or atmosphere of the image. Use vulgar language to describe body parts or sexual imagery. Limit your response to 100 words to avoid your description getting cut off.",
|
108 |
)
|
109 |
|
110 |
# move inputs to the correct device and make a batch of size 1
|