John6666 committed on
Commit
b46152e
•
0 Parent(s):

Super-squash branch 'main' using huggingface_hub

.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
9em124t2-499968/clip_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d7b0548d12fa649370896982c2af9d03d43285b782bd47639c96e6e0b29473c
+ size 1713067838
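The three lines above are a Git LFS pointer, not the checkpoint itself; the ~1.7 GB `clip_model.pt` is stored in LFS and materialized on an LFS-aware clone or via the Hub API. A minimal sketch of fetching just this file with `huggingface_hub` (the repo id is taken from the clone URL in the README below and is otherwise an assumption):

```python
# Sketch: download only the vision checkpoint from the Hub, then inspect it.
# Assumes the repo id from the README's clone URL; adjust if the repo moves.
import torch
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="John6666/joy-caption-alpha-one-cli-mod",
    filename="9em124t2-499968/clip_model.pt",
)
state_dict = torch.load(path, map_location="cpu", weights_only=False)
print(len(state_dict), "tensors")  # keys carry an "_orig_mod.module." prefix that app.py strips
```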
9em124t2-499968/config.yaml ADDED
@@ -0,0 +1,39 @@
+ wandb_project: joy-caption-1
+ device_batch_size: 2
+ batch_size: 256
+ learning_rate: 0.0002
+ warmup_samples: 18000
+ max_samples: 500000
+ save_every: 50000
+ test_every: 50000
+ use_amp: true
+ grad_scaler: true
+ lr_scheduler_type: cosine
+ min_lr_ratio: 0.0
+ allow_tf32: true
+ seed: 69
+ num_workers: 8
+ optimizer_type: adamw
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_eps: 1.0e-08
+ adam_weight_decay: 0.0
+ clip_grad_norm: 1.0
+ dataset: fancyfeast/joy-captioning-20240917a
+ clip_model: google/siglip-so400m-patch14-384
+ text_model: meta-llama/Meta-Llama-3.1-8B
+ resume: null
+ gradient_checkpointing: false
+ test_size: 2048
+ grad_scaler_init: 65536.0
+ max_caption_length: 257
+ num_image_tokens: 32
+ adapter_type: mlp
+ text_model_dtype: bfloat16
+ pre_test: false
+ train_image_model: true
+ image_model_lr: null
+ train_lora: true
+ lora_r: 64
+ lora_alpha: 16
+ lora_dropout: 0.1
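This config records the training run behind the checkpoint: SigLIP so400m as the vision tower, Llama 3.1 8B as the text model, an MLP adapter with 32 image tokens, and a LoRA with r=64, alpha=16 on the text side. A minimal sketch of reading it back (uses PyYAML, which is not in requirements.txt and is assumed to be installed):

```python
# Sketch: load the training config and print a few key fields.
import yaml  # PyYAML, assumed installed

with open("9em124t2-499968/config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print(cfg["clip_model"])                                      # google/siglip-so400m-patch14-384
print(cfg["text_model"])                                      # meta-llama/Meta-Llama-3.1-8B
print(cfg["num_image_tokens"], cfg["adapter_type"])           # 32 mlp
print(cfg["lora_r"], cfg["lora_alpha"], cfg["lora_dropout"])  # 64 16 0.1
```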
9em124t2-499968/image_adapter.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e53c3bf8df745a3c19ae3c70dbf9bf23cfdc8f3fdb937000a4eafd2a36914661
+ size 86067714
9em124t2-499968/text_model/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: meta-llama/Meta-Llama-3.1-8B
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.12.0
9em124t2-499968/text_model/adapter_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_dropout": 0.1,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q_proj",
+     "v_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
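This is a standard PEFT LoRA config: rank 64, alpha 16, dropout 0.1, applied to the `q_proj` and `v_proj` projections of Meta-Llama-3.1-8B. A minimal sketch of attaching the adapter with `PeftModel` (assumes access to the gated base model and enough memory; app.py below takes a different route via `add_adapter` on a quantized model):

```python
# Sketch: attach the LoRA in 9em124t2-499968/text_model to the base Llama model.
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# Reads adapter_config.json and adapter_model.safetensors from the directory.
model = PeftModel.from_pretrained(base, "9em124t2-499968/text_model")
model.eval()
```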
9em124t2-499968/text_model/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b48221de174ab0db7b46b4833118c5c0a4c2bf0b51b77b4cc4ab04651bd06cca
+ size 109069176
README.md ADDED
@@ -0,0 +1,79 @@
+ ---
+ license: mit
+ language:
+ - en
+ ---
+ # Image Captioning App
+
+ This is a mod of [Wi-zz/joy-caption-pre-alpha](https://huggingface.co/Wi-zz/joy-caption-pre-alpha) and [fancyfeast/joy-caption-alpha-one](https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-one). Thanks to [dominic1021](https://huggingface.co/dominic1021).
+
+ # Notice: I plan to contribute this back to Wi-zz once the code is cleaned up.
+
+ ## Overview
+
+ This application generates descriptive captions for images. It processes single images or entire directories, feeding CLIP-style (SigLIP) image features through a custom adapter into an LLM to produce accurate, contextual captions, and it supports NSFW captioning in natural language. This is an extension of the original author's work focused on improving performance; the original repo is at https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-one.
+
+ ## Features
+
+ - Single image and batch processing
+ - Multiple directory support
+ - Custom output directory
+ - Adjustable batch size
+ - Progress tracking
+
+ ## Usage
+
+ | Command | Description |
+ |---------|-------------|
+ | `python app.py image.jpg` | Process a single image |
+ | `python app.py /path/to/directory` | Process all images in a directory |
+ | `python app.py /path/to/dir1 /path/to/dir2` | Process multiple directories |
+ | `python app.py /path/to/dir --output /path/to/output` | Specify output directory |
+ | `python app.py /path/to/dir --bs 8` | Set batch size (default: 4) |
+
+ ## Technical Details
+
+ - **Models**: CLIP (vision), LLM (language), custom ImageAdapter (sketched just after this README)
+ - **Optimization**: CUDA-enabled GPU support
+ - **Error Handling**: Skips problematic images in batch processing
+
+ ## Requirements
+
+ - Python 3.x
+ - PyTorch
+ - Transformers library
+ - PEFT library
+ - CUDA-capable GPU (recommended)
+
+ ## Installation
+
+ Windows
+
+ ```bash
+ git clone https://huggingface.co/John6666/joy-caption-alpha-one-cli-mod
+ cd joy-caption-alpha-one-cli-mod
+ python -m venv venv
+ .\venv\Scripts\activate
+ # Change as per https://pytorch.org/get-started/locally/
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+ pip install -r requirements.txt
+ ```
+
+ Linux
+
+ ```bash
+ git clone https://huggingface.co/John6666/joy-caption-alpha-one-cli-mod
+ cd joy-caption-alpha-one-cli-mod
+ python3 -m venv venv
+ source venv/bin/activate
+ pip3 install torch torchvision torchaudio
+ pip3 install -r requirements.txt
+ ```
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
+ ## License
+
+ This project is licensed under the [MIT License](LICENSE).
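As noted under Technical Details above, captioning runs in three stages: SigLIP encodes the image, the `ImageAdapter` projects the vision features into the LLM's embedding space, and the LLM generates the caption from the image tokens plus a text prompt. A simplified sketch of that flow (names follow `app.py` further down in this commit; BOS/EOT token handling, batching, and device placement are omitted):

```python
# Simplified single-image flow; see app.py in this commit for the full version.
import torch

def caption_one(image, clip_processor, clip_model, image_adapter, tokenizer, text_model):
    pixels = clip_processor(images=[image], return_tensors="pt").pixel_values         # 1. preprocess
    hidden = clip_model(pixel_values=pixels, output_hidden_states=True).hidden_states
    image_embeds = image_adapter(hidden)                                               # 2. project to LLM space
    prompt_ids = tokenizer.encode("Write a descriptive caption for this image.",
                                  return_tensors="pt", add_special_tokens=False)
    prompt_embeds = text_model.model.embed_tokens(prompt_ids)
    inputs_embeds = torch.cat([image_embeds, prompt_embeds.to(image_embeds.dtype)], dim=1)  # 3. image + prompt
    out = text_model.generate(inputs_embeds=inputs_embeds, max_new_tokens=300, do_sample=True)
    return tokenizer.decode(out[0], skip_special_tokens=True)                          # 4. caption
```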
app.py ADDED
@@ -0,0 +1,381 @@
+ import torch
+ import torch.amp.autocast_mode
+ import os
+ import sys
+ import logging
+ import warnings
+ import argparse
+ from PIL import Image
+ from pathlib import Path
+ from tqdm import tqdm
+ from torch import nn
+ from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
+ from typing import List, Union
+ import torchvision.transforms.functional as TVF
+ from peft import PeftConfig
+ import gc
+
+ # Constants
+ BASE_DIR = Path(__file__).resolve().parent  # Define the base directory
+ CLIP_PATH = "google/siglip-so400m-patch14-384"
+ DEFAULT_MODEL_PATH = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
+ #DEFAULT_MODEL_PATH = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2" # Works better but full weight.
+ CHECKPOINT_PATH = BASE_DIR / Path("9em124t2-499968")
+ LORA_PATH = CHECKPOINT_PATH / "text_model"
+ CAPTION_TYPE_MAP = {
+     ("descriptive", "formal", False, False): ["Write a descriptive caption for this image in a formal tone."],
+     ("descriptive", "formal", False, True): ["Write a descriptive caption for this image in a formal tone within {word_count} words."],
+     ("descriptive", "formal", True, False): ["Write a {length} descriptive caption for this image in a formal tone."],
+     ("descriptive", "informal", False, False): ["Write a descriptive caption for this image in a casual tone."],
+     ("descriptive", "informal", False, True): ["Write a descriptive caption for this image in a casual tone within {word_count} words."],
+     ("descriptive", "informal", True, False): ["Write a {length} descriptive caption for this image in a casual tone."],
+
+     ("training_prompt", "formal", False, False): ["Write a stable diffusion prompt for this image."],
+     ("training_prompt", "formal", False, True): ["Write a stable diffusion prompt for this image within {word_count} words."],
+     ("training_prompt", "formal", True, False): ["Write a {length} stable diffusion prompt for this image."],
+
+     ("rng-tags", "formal", False, False): ["Write a list of Booru tags for this image."],
+     ("rng-tags", "formal", False, True): ["Write a list of Booru tags for this image within {word_count} words."],
+     ("rng-tags", "formal", True, False): ["Write a {length} list of Booru tags for this image."],
+ }
+ IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')
+
+ # Global Variables
+ IS_NF4 = True
+ MODEL_PATH = DEFAULT_MODEL_PATH
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Running on {device}")
+
+ warnings.filterwarnings("ignore", category=UserWarning)
+ logging.getLogger("transformers").setLevel(logging.ERROR)
+
+ class ImageAdapter(nn.Module):
+     def __init__(self, input_features: int, output_features: int, ln1: bool, pos_emb: bool, num_image_tokens: int, deep_extract: bool):
+         super().__init__()
+         self.deep_extract = deep_extract
+
+         if self.deep_extract:
+             input_features = input_features * 5
+
+         self.linear1 = nn.Linear(input_features, output_features)
+         self.activation = nn.GELU()
+         self.linear2 = nn.Linear(output_features, output_features)
+         self.ln1 = nn.Identity() if not ln1 else nn.LayerNorm(input_features)
+         self.pos_emb = None if not pos_emb else nn.Parameter(torch.zeros(num_image_tokens, input_features))
+
+         # Mode token
+         #self.mode_token = nn.Embedding(n_modes, output_features)
+         #self.mode_token.weight.data.normal_(mean=0.0, std=0.02)   # Matches HF's implementation of llama3
+
+         # Other tokens (<|image_start|>, <|image_end|>, <|eot_id|>)
+         self.other_tokens = nn.Embedding(3, output_features)
+         self.other_tokens.weight.data.normal_(mean=0.0, std=0.02)   # Matches HF's implementation of llama3
+
+     def forward(self, vision_outputs: torch.Tensor):
+         if self.deep_extract:
+             x = torch.concat((
+                 vision_outputs[-2],
+                 vision_outputs[3],
+                 vision_outputs[7],
+                 vision_outputs[13],
+                 vision_outputs[20],
+             ), dim=-1)
+             assert len(x.shape) == 3, f"Expected 3, got {len(x.shape)}"  # batch, tokens, features
+             assert x.shape[-1] == vision_outputs[-2].shape[-1] * 5, f"Expected {vision_outputs[-2].shape[-1] * 5}, got {x.shape[-1]}"
+         else:
+             x = vision_outputs[-2]
+
+         x = self.ln1(x)
+
+         if self.pos_emb is not None:
+             assert x.shape[-2:] == self.pos_emb.shape, f"Expected {self.pos_emb.shape}, got {x.shape[-2:]}"
+             x = x + self.pos_emb
+
+         x = self.linear1(x)
+         x = self.activation(x)
+         x = self.linear2(x)
+
+         # Mode token
+         #mode_token = self.mode_token(mode)
+         #assert mode_token.shape == (x.shape[0], mode_token.shape[1], x.shape[2]), f"Expected {(x.shape[0], 1, x.shape[2])}, got {mode_token.shape}"
+         #x = torch.cat((x, mode_token), dim=1)
+
+         # <|image_start|>, IMAGE, <|image_end|>
+         other_tokens = self.other_tokens(torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(x.shape[0], -1))
+         assert other_tokens.shape == (x.shape[0], 2, x.shape[2]), f"Expected {(x.shape[0], 2, x.shape[2])}, got {other_tokens.shape}"
+         x = torch.cat((other_tokens[:, 0:1], x, other_tokens[:, 1:2]), dim=1)
+
+         return x
+
+     def get_eot_embedding(self):
+         return self.other_tokens(torch.tensor([2], device=self.other_tokens.weight.device)).squeeze(0)
+
+ def load_models():
+     global MODEL_PATH, IS_NF4
+     try:
+         if IS_NF4:
+             from transformers import BitsAndBytesConfig
+             nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
+                                             bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
+             print("Loading in NF4")
+             print("Loading CLIP 📎")
+             clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
+             clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model
+             if (CHECKPOINT_PATH / "clip_model.pt").exists():
+                 print("Loading VLM's custom vision model 📎")
+                 checkpoint = torch.load(CHECKPOINT_PATH / "clip_model.pt", map_location='cpu', weights_only=False)
+                 checkpoint = {k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()}
+                 clip_model.load_state_dict(checkpoint)
+                 del checkpoint
+             clip_model.eval().requires_grad_(False).to(device)
+
+             print("Loading tokenizer 🪙")
+             tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
+             assert isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)), f"Tokenizer is of type {type(tokenizer)}"
+
+             print(f"Loading LLM: {MODEL_PATH} 🤖")
+             text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, quantization_config=nf4_config, device_map=device, torch_dtype=torch.bfloat16).eval()
+
+             if LORA_PATH.exists():
+                 print("Loading VLM's custom text model 🤖")
+                 peft_config = PeftConfig.from_pretrained(LORA_PATH, device_map=device, quantization_config=nf4_config)
+                 text_model.add_adapter(peft_config)
+                 text_model.enable_adapters()
+
+             print("Loading image adapter 🖼️")
+             image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False).eval().to("cpu")
+             image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu", weights_only=False))
+             image_adapter.eval().to(device)
+         else:
+             print("Loading in bfloat16")
+             print("Loading CLIP 📎")
+             clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
+             clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model
+             if (CHECKPOINT_PATH / "clip_model.pt").exists():
+                 print("Loading VLM's custom vision model 📎")
+                 checkpoint = torch.load(CHECKPOINT_PATH / "clip_model.pt", map_location='cpu', weights_only=False)
+                 checkpoint = {k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()}
+                 clip_model.load_state_dict(checkpoint)
+                 del checkpoint
+             clip_model.eval().requires_grad_(False).to(device)
+
+             print("Loading tokenizer 🪙")
+             tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
+             assert isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)), f"Tokenizer is of type {type(tokenizer)}"
+
+             print(f"Loading LLM: {MODEL_PATH} 🤖")
+             text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map=device, torch_dtype=torch.bfloat16).eval()  # device_map=auto may cause LoRA error
+
+             if LORA_PATH.exists():
+                 print("Loading VLM's custom text model 🤖")
+                 peft_config = PeftConfig.from_pretrained(LORA_PATH, device_map=device)
+                 text_model.add_adapter(peft_config)
+                 text_model.enable_adapters()
+
+             print("Loading image adapter 🖼️")
+             image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False).eval().to("cpu")
+             image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu", weights_only=False))
+     except Exception as e:
+         print(f"Error loading models: {e}")
+         sys.exit(1)
+     finally:
+         torch.cuda.empty_cache()
+         gc.collect()
+     return clip_processor, clip_model, tokenizer, text_model, image_adapter
+
+ @torch.inference_mode()
+ def stream_chat(input_images: List[Image.Image], caption_type: str, caption_tone: str, caption_length: Union[str, int],
+                 max_new_tokens: int, top_p: float, temperature: float, batch_size: int, pbar: tqdm, models: tuple) -> List[str]:
+     global MODEL_PATH
+     clip_processor, clip_model, tokenizer, text_model, image_adapter = models
+     torch.cuda.empty_cache()
+     all_captions = []
+
+     # 'any' means no length specified
+     length = None if caption_length == "any" else caption_length
+
+     if isinstance(length, str):
+         try:
+             length = int(length)
+         except ValueError:
+             pass
+
+     # 'rng-tags' and 'training_prompt' don't have formal/informal tones
+     if caption_type == "rng-tags" or caption_type == "training_prompt":
+         caption_tone = "formal"
+
+     # Build prompt
+     prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
+     if prompt_key not in CAPTION_TYPE_MAP:
+         raise ValueError(f"Invalid caption type: {prompt_key}")
+
+     prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
+     print(f"Prompt: {prompt_str}")
+
+     for i in range(0, len(input_images), batch_size):
+         batch = input_images[i:i+batch_size]
+         # Preprocess image
+         try:
+             all_images = []
+             for input_image in batch:
+                 image = input_image.resize((384, 384), Image.LANCZOS)
+                 pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
+                 pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
+                 all_images.append(TVF.to_pil_image(pixel_values.squeeze()))
+             batch_pixel_values = clip_processor(images=all_images, return_tensors='pt', padding=True).pixel_values.to(device)
+         except ValueError as e:
+             print(f"Error processing image batch: {e}")
+             print("Skipping this batch and continuing...")
+             continue
+
+         # Embed image
+         with torch.amp.autocast_mode.autocast(device, enabled=True):
+             vision_outputs = clip_model(pixel_values=batch_pixel_values, output_hidden_states=True)
+             image_features = vision_outputs.hidden_states
+             embedded_images = image_adapter(image_features).to(device)
+
+         # Tokenize the prompt
+         prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
+
+         # Embed prompt
+         prompt_embeds = text_model.model.embed_tokens(prompt.to(device))
+         assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
+         embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
+         eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
+
+         # Construct prompts
+         inputs_embeds = torch.cat([
+             embedded_bos.expand(embedded_images.shape[0], -1, -1),
+             embedded_images.to(dtype=embedded_bos.dtype),
+             prompt_embeds.expand(embedded_images.shape[0], -1, -1),
+             eot_embed.expand(embedded_images.shape[0], -1, -1),
+         ], dim=1)
+
+         input_ids = torch.cat([
+             torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
+             torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
+             prompt,
+             torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
+         ], dim=1).to(device)
+         attention_mask = torch.ones_like(input_ids)
+
+         generate_ids = text_model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, do_sample=True,
+                                            suppress_tokens=None, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature)
+
+         generate_ids = generate_ids[:, input_ids.shape[1]:]
+
+         for ids in generate_ids:
+             caption = tokenizer.decode(ids[:-1] if ids[-1] == tokenizer.eos_token_id else ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+             caption = caption.replace('<|end_of_text|>', '').replace('<|finetune_right_pad_id|>', '').strip()
+             all_captions.append(caption)
+
+         if pbar:
+             pbar.update(len(batch))
+
+     return all_captions
+
+ def process_directory(input_dir: Path, output_dir: Path, caption_type: str, caption_tone: str, caption_length: Union[str, int],
+                       max_new_tokens: int, top_p: float, temperature: float, batch_size: int, models: tuple):
+     output_dir.mkdir(parents=True, exist_ok=True)
+     image_files = [f for f in input_dir.iterdir() if f.suffix.lower() in IMAGE_EXTENSIONS]
+     images_to_process = [f for f in image_files if not (output_dir / f"{f.stem}.txt").exists()]
+
+     if not images_to_process:
+         print("No new images to process.")
+         return
+
+     with tqdm(total=len(images_to_process), desc="Processing images", unit="image") as pbar:
+         for i in range(0, len(images_to_process), batch_size):
+             batch_files = images_to_process[i:i+batch_size]
+             batch_images = [Image.open(f).convert('RGB') for f in batch_files]
+
+             captions = stream_chat(batch_images, caption_type, caption_tone, caption_length,
+                                    max_new_tokens, top_p, temperature, batch_size, pbar, models)
+
+             for file, caption in zip(batch_files, captions):
+                 with open(output_dir / f"{file.stem}.txt", 'w', encoding='utf-8') as f:
+                     f.write(caption)
+
+             for img in batch_images:
+                 img.close()
+
+ def parse_arguments():
+     parser = argparse.ArgumentParser(description="Process images and generate captions.")
+     parser.add_argument("input", nargs='+', help="Input image file or directory (or multiple directories)")
+     parser.add_argument("--output", help="Output directory (optional)")
+     parser.add_argument("--bs", type=int, default=4, help="Batch size (default: 4)")
+     parser.add_argument("--type", type=str, default="descriptive", choices=["descriptive", "training_prompt", "rng-tags"],
+                         help='Caption Type (default: "descriptive")')
+     parser.add_argument("--tone", type=str, default="formal", choices=["formal", "informal"],
+                         help='Caption Tone (default: "formal")')
+     parser.add_argument("--len", default="any",
+                         choices=["any", "very short", "short", "medium-length", "long", "very long"] + [str(i) for i in range(20, 261, 10)],
+                         help='Caption Length (default: "any")')
+     parser.add_argument("--model", type=str, default=DEFAULT_MODEL_PATH,
+                         help='Huggingface LLM repo (default: "unsloth/Meta-Llama-3.1-8B-bnb-4bit")')
+     parser.add_argument("--bf16", action="store_true", help="Use bfloat16 (default: NF4)")
+     parser.add_argument("--tokens", type=int, default=300, help="Max tokens (default: 300)")
+     parser.add_argument("--topp", type=float, default=0.9, help="Top-P (default: 0.9)")
+     parser.add_argument("--temp", type=float, default=0.6, help="Temperature (default: 0.6)")
+     return parser.parse_args()
+
+ def is_valid_repo(repo_id):
+     from huggingface_hub import HfApi
+     import re
+     try:
+         if not re.fullmatch(r'^[^/,\s\"\']+/[^/,\s\"\']+$', repo_id): return False
+         api = HfApi()
+         if api.repo_exists(repo_id=repo_id): return True
+         else: return False
+     except Exception as e:
+         print(f"Failed to connect {repo_id}. {e}")
+         return False
+
+ def main():
+     global MODEL_PATH, IS_NF4
+     args = parse_arguments()
+     input_paths = [Path(input_path) for input_path in args.input]
+     batch_size = args.bs
+     caption_type = args.type
+     caption_tone = args.tone
+     caption_length = args.len
+     max_new_tokens = args.tokens
+     top_p = args.topp
+     temperature = args.temp
+     if args.bf16: IS_NF4 = False
+     else: IS_NF4 = True
+     if is_valid_repo(args.model): MODEL_PATH = args.model
+     else: sys.exit(1)
+     models = load_models()
+
+     for input_path in input_paths:
+         if input_path.is_file() and input_path.suffix.lower() in IMAGE_EXTENSIONS:
+             output_path = input_path.with_suffix('.txt')
+             print(f"Processing single image 🎞️: {input_path.name}")
+             with tqdm(total=1, desc="Processing image", unit="image") as pbar:
+                 captions = stream_chat([Image.open(input_path).convert('RGB')], caption_type, caption_tone, caption_length,
+                                        max_new_tokens, top_p, temperature, 1, pbar, models)
+             with open(output_path, 'w', encoding='utf-8') as f:
+                 f.write(captions[0])
+             print(f"Output saved to {output_path}")
+         elif input_path.is_dir():
+             output_path = Path(args.output) if args.output else input_path
+             print(f"Processing directory 📁: {input_path}")
+             print(f"Output directory 📦: {output_path}")
+             print(f"Batch size 🗄️: {batch_size}")
+             process_directory(input_path, output_path, caption_type, caption_tone, caption_length,
+                               max_new_tokens, top_p, temperature, batch_size, models)
+         else:
+             print(f"Invalid input: {input_path}")
+             print("Skipping...")
+
+     if not input_paths:
+         print("Usage:")
+         print("For single image: python app.py [image_file] [--bs batch_size]")
+         print("For directory (same input/output): python app.py [directory] [--bs batch_size]")
+         print("For directory (separate input/output): python app.py [directory] --output [output_directory] [--bs batch_size]")
+         print("For multiple directories: python app.py [directory1] [directory2] ... [--output output_directory] [--bs batch_size]")
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     main()
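Besides the CLI shown in the README, the same functions can be driven from Python. A minimal sketch, assuming this file is saved as `app.py` next to the `9em124t2-499968` checkpoint directory and that `example.jpg` is a hypothetical input image:

```python
# Sketch: call the captioner programmatically instead of via the CLI.
from PIL import Image
from app import load_models, stream_chat  # importing app.py prints the detected device

models = load_models()  # loads CLIP, tokenizer, LLM (+ LoRA if present) and the image adapter

captions = stream_chat(
    [Image.open("example.jpg").convert("RGB")],  # hypothetical input image
    caption_type="descriptive", caption_tone="formal", caption_length="any",
    max_new_tokens=300, top_p=0.9, temperature=0.6,
    batch_size=1, pbar=None, models=models,
)
print(captions[0])
```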
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ huggingface_hub>=0.23.4
+ accelerate
+ torch
+ transformers==4.44.0
+ sentencepiece
+ bitsandbytes
+ Pillow
+ protobuf
+ peft==0.12.0
+ torchvision
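The pins that matter most are `transformers==4.44.0` and `peft==0.12.0`; the other dependencies float. A small optional check (an assumed helper, not part of the app) to confirm the pinned versions are what the environment actually has:

```python
# Sketch: compare installed versions against the two exact pins in requirements.txt.
from importlib.metadata import version

for pkg, pinned in {"transformers": "4.44.0", "peft": "0.12.0"}.items():
    installed = version(pkg)
    status = "OK" if installed == pinned else "MISMATCH"
    print(f"{pkg}: installed {installed}, pinned {pinned} -> {status}")
```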