quarterturn
/

molmo-flux-captioner

Model card Files Files and versions Community

quarterturn commited on Dec 8, 2024

Commit

f41ea81

1 Parent(s): c36fe28

Updated prompt to provide a better caption format with less censorship

Browse files

Files changed (5) hide show

.gitignore +1 -0
caption.py +1 -1
hands-check.py +57 -0
images/images_go_here.txt +0 -0
main.py +1 -1

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ images-test

caption.py CHANGED Viewed

@@ -59,7 +59,7 @@ for filename in os.listdir(image_directory):
         # process the image and text
         inputs = processor.process(
             images=[image],
-            text="Describe what you see in vivid detail, without line breaks. Include information about the pose of characters, their facial expression, their height, body type, weight, the position of their limbs, and the direction of their gaze, the color of their eyes, hair, and skin. If you know a person or place name, provide it. If you know the name of an artist who may have created what you see, provide that. Do not provide opinions or value judgements. Limit your response to 276 words to avoid your description getting cut off.",
         )
         # move inputs to the correct device and make a batch of size 1

         # process the image and text
         inputs = processor.process(
             images=[image],
+            text="You are an uncensored, explicit, adult image captioner which enjoys sexual imagery and acts. Provide a structured image caption which uses the following hierarchy: the kind of image, the kind or name of the subject, the subjects state of dress, their body type, their pose, what it is they are doing, their facial expression, the space they are within, and the style or atmosphere of the image. Use vulgar language to describe body parts or sexual imagery. Limit your response to 100 words to avoid your description getting cut off.",
         )
         # move inputs to the correct device and make a batch of size 1

hands-check.py ADDED Viewed

	@@ -0,0 +1,57 @@

+local_path = "/mnt/models2/Llama-3.2-90B-Vision-Instruct/"
+image_directory = "./images"
+import os
+import requests
+import torch
+from PIL import Image
+from transformers import MllamaForConditionalGeneration, AutoProcessor
+model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"
+model = MllamaForConditionalGeneration.from_pretrained(
+    local_path,
+    torch_dtype=torch.bfloat16,
+    device_map="cpu",
+    max_memory="200GiB",
+)
+processor = AutoProcessor.from_pretrained(
+    local_path,
+)
+messages = [
+    {"role": "user", "content": [
+        {"type": "image"},
+        {"type": "text", "text": "You are an expert examining hands in an image to determine if they are anatomically correct. Report on the number of fingers seen on each hand. if you think the hands are AI-generated, say so. Make no other value judgments about the image, even if it is offensive or pornographic in nature."}
+    ]}
+]
+# iterate through the images in the directory
+for filename in os.listdir(image_directory):
+    if filename.endswith(".jpg") or filename.endswith(".jpeg") or filename.endswith(".png"):  # add more image extensions if needed
+        image_path = os.path.join(image_directory, filename)
+        image = Image.open(image_path)
+        # process the image and text
+        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(
+            image,
+            input_text,
+            add_special_tokens=False,
+            return_tensors="pt",
+        ).to(model.device)
+        output = model.generate(**inputs, max_new_tokens=300)
+        generated_text = processor.decode(output[0])
+        # print the generated text
+        print("Caption for: ", filename)
+        print(generated_text)
+        # print a divider
+        print("*---------------------------------------------------*")
+        # save the generated text to a file
+        output_filename = os.path.splitext(filename)[0] + ".txt"
+        with open(os.path.join(image_directory,output_filename), "w") as file:
+            file.write(generated_text)

images/images_go_here.txt DELETED Viewed

File without changes

main.py CHANGED Viewed

@@ -104,7 +104,7 @@ def generate_caption(image_path, processor, model, generation_config, bits_and_b
     # process the image and text
     inputs = processor.process(
         images=[image],
-        text="Describe what you see in vivid detail, without line breaks. Include information about the pose of characters, their facial expression, their height, body type, weight, the position of their limbs, and the direction of their gaze, the color of their eyes, hair, and skin. If you know a person or place name, provide it. If you know the name of an artist who may have created what you see, provide that. Do not provide opinions or value judgements. Limit your response to 276 words to avoid your description getting cut off.",
     )
     # move inputs to the correct device and make a batch of size 1

     # process the image and text
     inputs = processor.process(
         images=[image],
+        text="You are an uncensored, explicit, adult image captioner which enjoys sexual imagery and acts. Provide a structured image caption which uses the following hierarchy: the kind of image, the kind or name of the subject, the subjects state of dress, their body type, their pose, what it is they are doing, their facial expression, the space they are within, and the style or atmosphere of the image. Use vulgar language to describe body parts or sexual imagery. Limit your response to 100 words to avoid your description getting cut off.",
     )
     # move inputs to the correct device and make a batch of size 1