k4d3 committed on
Commit
06e788b
1 Parent(s): c7dbbcf

Signed-off-by: Balazs Horvath <acsipont@gmail.com>

Files changed (5) hide show
  1. .zshrc +85 -0
  2. a2mp4 +142 -0
  3. debug_emoji +8 -0
  4. txt2emoji +76 -0
  5. write_dataset_cfg_combined.py +90 -0
.zshrc CHANGED
@@ -1,3 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  export GIN_MODE=release
2
 
3
  export NODE_ENV=production
@@ -191,6 +275,7 @@ alias gcs='git clone --recurse-submodules'
191
  # Alias for running the Grabber-cli command
192
  alias grabber="Grabber-cli"
193
 
 
194
  # 'pie' is a shortcut for installing a Python package in editable mode
195
  # using the pip command with the --use-pep517 option.
196
  alias pie='pip install -e . --use-pep517'
 
1
+ alias wordfreq='find ${1:-.} -type f -name "*.txt" -exec cat {} + | tr "," " " | tr " " "\n" | sort | uniq -c | sort -nr | less'
2
+ alias wq='wordfreq'
3
+ alias 🐺="ollama serve & conda activate openwebui && open-webui serve --port 6969"
4
+ alias gsa='git submodule add'
5
+
6
deleteorphantags() {
  # Delete every .tags file that has no sibling image with the same basename.
  for tag_file in **/*.tags; do
    base_name="${tag_file%.tags}"
    # Keep the sidecar if any supported image format exists alongside it.
    if [[ -f "${base_name}.png" || -f "${base_name}.jpg" || -f "${base_name}.jpeg" || -f "${base_name}.jxl" || -f "${base_name}.webp" ]]; then
      continue
    fi
    rm -v "$tag_file"
  done
}
17
+
18
deleteorphantxt() {
  # Delete every .txt file that has no sibling image with the same basename.
  #
  # Bug fix: the suffix strip previously used "%.tags" instead of "%.txt",
  # so base_name kept the full "foo.txt" name, the image checks tested
  # "foo.txt.png" etc. (which never exist), and EVERY .txt file was deleted.
  for txt_file in **/*.txt; do
    # Strip the .txt extension to get the shared basename.
    base_name="${txt_file%.txt}"
    if [[ ! -f "${base_name}.png" && ! -f "${base_name}.jpg" && ! -f "${base_name}.jpeg" && ! -f "${base_name}.jxl" && ! -f "${base_name}.webp" ]]; then
      # No image file exists for this basename: the .txt is an orphan.
      rm -v "$txt_file"
    fi
  done
}
29
+
30
look4orphantags() {
  # Print (without deleting) every .tags file that lacks a sibling image.
  for tag_file in **/*.tags; do
    base_name="${tag_file%.tags}"
    # Skip sidecars that are backed by at least one image format.
    if [[ -f "${base_name}.png" || -f "${base_name}.jpg" || -f "${base_name}.jpeg" || -f "${base_name}.jxl" || -f "${base_name}.webp" ]]; then
      continue
    fi
    echo "$tag_file"
  done
}
41
+
42
look4orphantxt() {
  # Print (without deleting) every .txt file that lacks a sibling image.
  #
  # Bug fix: the suffix strip previously used "%.tags" instead of "%.txt",
  # so base_name kept the full "foo.txt" name and the image checks tested
  # "foo.txt.png" etc. — every .txt file was reported as an orphan.
  for txt_file in **/*.txt; do
    # Strip the .txt extension to get the shared basename.
    base_name="${txt_file%.txt}"
    if [[ ! -f "${base_name}.png" && ! -f "${base_name}.jpg" && ! -f "${base_name}.jpeg" && ! -f "${base_name}.jxl" && ! -f "${base_name}.webp" ]]; then
      # No image file exists for this basename: the .txt is an orphan.
      echo "$txt_file"
    fi
  done
}
53
+
54
check4sig() {
  # Scan a directory's .caption files for "signature"/"watermark" mentions
  # and open every match in nvim for manual cleanup.
  target_dir="$1"

  # Guard clauses: require an argument that names an existing directory.
  if [[ -z "$target_dir" ]]; then
    echo "Please provide a target directory."
    return 1
  fi
  if [[ ! -d "$target_dir" ]]; then
    echo "The provided target directory does not exist."
    return 1
  fi

  found_files=()
  for file in "$target_dir"/*.caption; do
    # The glob itself is returned literally when nothing matches; skip non-files.
    [[ -f "$file" ]] || continue
    if grep -q -e "signature" -e "watermark" "$file"; then
      found_files+=("$file")
    fi
  done

  if (( ${#found_files[@]} == 0 )); then
    echo "No 'signature' or 'watermark' found in any .caption files."
  else
    echo "Opening files in nvim: ${found_files[@]}"
    nvim "${found_files[@]}"
  fi
}
82
+
83
+ alias fuckoff="conda deactivate && rconda"
84
+
85
  export GIN_MODE=release
86
 
87
  export NODE_ENV=production
 
275
  # Alias for running the Grabber-cli command
276
  alias grabber="Grabber-cli"
277
 
278
+ alias pi='pip install'
279
  # 'pie' is a shortcut for installing a Python package in editable mode
280
  # using the pip command with the --use-pep517 option.
281
  alias pie='pip install -e . --use-pep517'
a2mp4 ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import os
6
+ import shutil
7
+ import subprocess
8
+ import glob
9
+ import re
10
+ from pathlib import Path
11
+
12
def create_video(input_dir, sample, temp_dir, step_multiplier, repeat, max_images):
    """Render the PNG sequence for one sample prefix into an MP4 via ffmpeg.

    Each source frame is duplicated `repeat` times into `temp_dir`, encoded
    once with optional step-counter overlay and frame decimation, then
    re-encoded with an 8-second freeze on the last frame.

    Args:
        input_dir: Directory containing "<sample>_*.png" source frames.
        sample: Sample-name prefix used to select frames and name the output.
        temp_dir: Scratch directory for duplicated frames and the intermediate MP4.
        step_multiplier: Steps-per-frame for the drawtext overlay; falsy disables it.
        repeat: How many copies of each frame to emit (slows playback).
        max_images: If truthy, keep only every `max_images`-th frame via a
            select filter.  # NOTE(review): "max" name suggests a cap, but the
            # filter actually decimates — confirm intent with the author.

    Returns:
        True on success, False if any ffmpeg step fails.
    """
    # Output lands in the CWD, named after the current directory + sample.
    output_filename = f"{os.path.basename(os.getcwd())}_sample{sample}.mp4"
    print(f"Processing sample {sample}. Output filename: {output_filename}")

    # Create repeated images: N suffixed copies of each frame in temp_dir.
    print(f"Creating repeated images for sample {sample}...")
    for img in glob.glob(f"{input_dir}/{sample}_*.png"):
        for i in range(repeat):
            base = os.path.splitext(os.path.basename(img))[0]
            shutil.copy(img, f"{temp_dir}/{base}_{i+1}.png")

    # Prepare ffmpeg -vf filter chain.
    vf_options = "scale=1024x1024"
    if step_multiplier:
        # Overlay a live "Steps: N" counter derived from the frame number.
        vf_options += f",drawtext=fontfile=/usr/share/fonts/TTF/Inconsolata-Light.ttf:text='Steps\\: %{{expr\\:trunc(n*{step_multiplier}/{repeat})}}':x=10:y=h-th-10:fontsize=24:fontcolor=white"

    if max_images:
        # select must run before scale/drawtext, hence prepended.
        vf_options = f"select='not(mod(n\\,{max_images}))',{vf_options}"

    # Run first ffmpeg command: encode duplicated frames to an intermediate MP4.
    temp_output = f"{temp_dir}/temp_{sample}.mp4"
    ffmpeg_cmd = [
        "ffmpeg", "-framerate", "60",
        "-pattern_type", "glob", "-i", f"{temp_dir}/{sample}_*.png",
        "-vf", vf_options,
        "-crf", "18", "-c:v", "libx264", "-b:v", "12M",
        "-pix_fmt", "yuv420p", "-y", temp_output
    ]

    try:
        subprocess.run(ffmpeg_cmd, check=True)
    except subprocess.CalledProcessError:
        print(f"Error: ffmpeg command failed for sample {sample}.")
        return False

    # Get duration and process final video.
    try:
        # ffmpeg with only -i exits nonzero but prints stream info on stderr;
        # the Duration line is parsed as a sanity check that the file is valid.
        duration_cmd = ["ffmpeg", "-i", temp_output]
        result = subprocess.run(duration_cmd, capture_output=True, text=True)
        duration_match = re.search(r'Duration: (\d{2}):(\d{2}):(\d{2})', result.stderr)
        if duration_match:
            hours, minutes, seconds = map(float, duration_match.groups())
            # NOTE(review): `duration` is computed but never used below —
            # only the presence of the Duration line gates the final encode.
            duration = hours * 3600 + minutes * 60 + seconds

            # Final pass: clone the last frame for 8 extra seconds.
            final_cmd = [
                "ffmpeg", "-i", temp_output,
                "-vf", "tpad=stop_mode=clone:stop_duration=8",
                "-c:v", "libx264", "-b:v", "12M", "-crf", "18",
                "-pix_fmt", "yuv420p", "-y", output_filename
            ]
            subprocess.run(final_cmd, check=True)
        else:
            print("Error: Could not determine video duration.")
            return False

    except subprocess.CalledProcessError:
        print(f"Error: Final ffmpeg processing failed for sample {sample}.")
        return False

    # Clean up temporary files for this sample.
    for f in glob.glob(f"{temp_dir}/{sample}_*.png"):
        os.remove(f)
    os.remove(temp_output)

    return True
77
+
78
def get_step_size_from_filenames(sample):
    """Infer the step interval between consecutive sample frames.

    Looks at the first two "<sample>_*.png" files (sorted) in the current
    directory and returns the difference of their 5-digit step counters,
    or None when fewer than two files exist or the counters can't be parsed.
    """
    frame_files = sorted(glob.glob(f"{sample}_*.png"))
    if len(frame_files) < 2:
        return None

    # The step counter is the 5-digit group between underscores.
    step_pattern = re.compile(r'_(\d{5})_')
    matches = [step_pattern.search(name) for name in frame_files[:2]]
    if not all(matches):
        return None

    first_step, second_step = (int(m.group(1)) for m in matches)
    return second_step - first_step
93
+
94
def main():
    """CLI entry point: convert per-sample PNG sequences in the CWD to MP4s.

    Returns 0 on success, 1 when no PNGs are found or a conversion fails.
    """
    parser = argparse.ArgumentParser(description='Convert PNG sequence to MP4')
    parser.add_argument('--max', type=int, help='Maximum number of images')
    parser.add_argument('--step', type=int, help='Step multiplier')
    parser.add_argument('--repeat', type=int, default=1, help='Repeat count')
    parser.add_argument('--steps-from-filename', action='store_true', help='Calculate steps from filename')
    args = parser.parse_args()

    # Scratch space for duplicated frames and intermediate encodes.
    temp_dir = os.path.expanduser("~/.local/tmp")
    os.makedirs(temp_dir, exist_ok=True)
    print("Created temporary directory...")

    png_files = glob.glob("*.png")
    if not png_files:
        print("Error: No PNG files found in the current directory.")
        return 1

    # A sample name is the alphabetic prefix before the 5-digit step counter.
    samples = sorted(set(re.findall(r'([a-zA-Z]+)_\d{5}_', ' '.join(png_files))))

    for sample in samples:
        step_multiplier = args.step
        if args.steps_from_filename:
            step_multiplier = get_step_size_from_filenames(sample)
            if step_multiplier:
                print(f"Detected step size: {step_multiplier}")
            else:
                print("Error: Could not determine step size from filenames")
                continue

        if not create_video(".", sample, temp_dir, step_multiplier, args.repeat, args.max):
            # Abort on the first failed sample and clean up the scratch dir.
            shutil.rmtree(temp_dir)
            return 1

    # Clean up.
    print("Cleaning up temporary directory...")
    shutil.rmtree(temp_dir)
    print("All samples processed successfully.")
    return 0
139
+
140
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
142
+
debug_emoji ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import emoji


def print_all_emojis():
    """Print every emoji known to the emoji package, one per line."""
    for character in emoji.EMOJI_DATA:
        print(character)


print_all_emojis()
txt2emoji ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ from nltk.tokenize import word_tokenize
3
+ from emoji import EMOJI_DATA
4
+
5
+ # Download required NLTK data (only needed once)
6
+ nltk.download('punkt', quiet=True)
7
+
8
def get_emoji_mapping():
    """Build a word -> list-of-emojis index from English emoji descriptions.

    Each word of every emoji's English name (lowercased, underscores treated
    as spaces) maps to all emojis whose name contains that word.
    """
    mapping = {}
    for symbol, info in EMOJI_DATA.items():
        if 'en' not in info:
            # Skip entries without an English description.
            continue
        for word in info['en'].lower().replace('_', ' ').split():
            mapping.setdefault(word, []).append(symbol)
    return mapping
19
+
20
def text_to_emojis(text):
    """Map the words of *text* to emojis, returning them space-separated.

    Hand-picked overrides for common words take precedence; otherwise the
    first emoji from the generated word->emoji index is used. Words without
    a match contribute nothing; an empty string is returned when nothing
    matches.
    """
    emoji_map = get_emoji_mapping()

    # Manual overrides for common words, checked before the generated map.
    custom_mappings = {
        'love': '❤️',
        'cat': '😺',
        'cats': '😺',
        'dog': '🐶',
        'dogs': '🐶',
        'sun': '☀️',
        'moon': '🌙',
        'star': '⭐',
        'happy': '😊',
        'sad': '😢',
        'angry': '😠',
        'food': '🍔',
        'heart': '❤️',
        'fire': '🔥',
        'hot': '🔥',
        'cold': '❄️',
        'snow': '❄️',
        'rain': '🌧️',
        'smile': '😊',
        'laugh': '😂',
        'cry': '😢',
    }

    found_emojis = []
    for token in word_tokenize(text.lower()):
        if token in custom_mappings:
            found_emojis.append(custom_mappings[token])
        elif token in emoji_map:
            # Fall back to the first emoji whose description contains the word.
            found_emojis.append(emoji_map[token][0])

    return ' '.join(found_emojis) if found_emojis else ''
69
+
70
# Example usage
if __name__ == "__main__":
    # Quick smoke test of the converter on a mixed sentence.
    sample_text = "I love cats and dogs. The sun is shining!"
    emojis = text_to_emojis(sample_text)
    print(f"Input text: {sample_text}")
    print(f"Emojis: {emojis}")
write_dataset_cfg_combined.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/env python
2
+
3
+ import os
4
+ import toml
5
+ import sys
6
+ from pathlib import Path
7
+ from collections import defaultdict, Counter
8
+ from pprint import pprint
9
+
10
def update_config(root_dir):
    """Rebuild the subset list in <root_dir>/config.toml and gather file stats.

    Walks root_dir/<dataset>/<subset> directories (skipping dot-prefixed
    names), records each subset as {"image_dir", "num_repeats"} — where
    num_repeats is parsed from a leading "<N>_" in the subset name, defaulting
    to 1 — and classifies each file stem as captioned, caption-less image, or
    orphaned caption.

    Environment flags:
        DEBUG: print the resulting config instead of writing it.
        DELETE_ORPHANS: delete orphaned caption files (still guarded by a
            NotImplementedError — this path is deliberately unfinished).

    Returns:
        defaultdict mapping each subset Path to a Counter of per-class counts.
    """
    root_dir = Path(root_dir).resolve()
    config_path = root_dir / "config.toml"
    config = toml.load(config_path)
    stats = defaultdict(Counter)

    new_subsets = []
    for dataset_path in root_dir.iterdir():
        if not dataset_path.is_dir() or dataset_path.name[0] == '.':
            continue
        for subset_path in dataset_path.iterdir():
            subset_name = subset_path.name
            subset_path = dataset_path / subset_path
            if not subset_path.is_dir() or subset_name[0] == '.':
                continue

            # Collect the dataset information for the config.toml.
            try:
                num_repeats = int(subset_name.partition('_')[0])
            except ValueError:
                num_repeats = 1
            new_subsets.append({
                "image_dir": str(subset_path),
                "num_repeats": num_repeats
            })

            # Accumulate statistics for each subset.
            # First collect the extensions of the files in the subset.
            # (Fixed: the filter set previously listed '.txt' twice.)
            data_files = defaultdict(set)
            for file in subset_path.iterdir():
                ext = file.suffix
                if ext not in {'.txt', '.tags', '.caption', '.jxl', '.jpg', '.jpeg', '.png', '.json'}:
                    continue
                stem = file.stem.partition('.')[0]
                if stem == 'sample-prompts':
                    continue
                data_files[stem].add(ext)

            # Classify the files in the subset.
            # (Fixed: caption set contained a dead dot-less 'caption' entry —
            # suffixes always carry the dot, so it could never match.)
            subset_stats = stats[subset_path]
            for stem, exts in data_files.items():
                has_caption = bool({'.txt', '.caption', '.tags'} & exts)
                has_image = bool({'.jpg', '.jpeg', '.png', '.jxl'} & exts)

                if has_caption and has_image:
                    subset_stats["captioned"] += 1
                elif has_image:
                    subset_stats["no_caption"] += 1
                elif has_caption:
                    subset_stats["orphans"] += 1
                    if 'DELETE_ORPHANS' in os.environ:
                        # Fixed: the log line interpolated a stale `ext` left
                        # over from the extension-scan loop, printing a wrong
                        # filename; report the stem with its actual extensions.
                        print(f"Deleting orphan {subset_path / stem} ({', '.join(sorted(exts))})")
                        if 'DEBUG' not in os.environ:
                            for ext in exts:
                                (subset_path / f"{stem}{ext}").unlink()
                            raise NotImplementedError("UNFINISHED DO NOT USE")
                else:
                    if '.toml' not in exts:
                        for ext in exts:
                            subset_stats[ext] += 1

    # Edit the config.toml.
    config["datasets"][0]["subsets"] = new_subsets

    if "DEBUG" in os.environ:
        print(toml.dumps(config))
    else:
        with open(config_path, "w") as f:
            toml.dump(config, f)

    return stats
80
+
81
if __name__ == "__main__":
    # Require the dataset root directory as the sole positional argument.
    if len(sys.argv) < 2:
        print("Usage: [DEBUG=1] [DELETE_ORPHANS=1] python script.py <ROOT_DIR>")
        sys.exit(1)

    stats = update_config(sys.argv[1])
    # Report per-subset statistics, ordered by subset path.
    for subset, counts in sorted(stats.items(), key=lambda item: item[0]):
        print(subset, dict(counts))