Merge branch 'main' of hf.co:/k4d3/toolkit
Browse files- .zshrc +86 -56
- audiogen_medium.py +18 -0
- {9em124t2-499968 → cgrkzexw-599808}/clip_model.pt +1 -1
- {9em124t2-499968 → cgrkzexw-599808}/config.yaml +3 -3
- cgrkzexw-599808/image_adapter.pt +3 -0
- {9em124t2-499968 → cgrkzexw-599808}/text_model/README.md +1 -1
- {9em124t2-499968 → cgrkzexw-599808}/text_model/adapter_config.json +8 -3
- {9em124t2-499968 → cgrkzexw-599808}/text_model/adapter_model.safetensors +2 -2
- cgrkzexw-599808/text_model/special_tokens_map.json +23 -0
- cgrkzexw-599808/text_model/tokenizer.json +0 -0
- cgrkzexw-599808/text_model/tokenizer_config.json +2064 -0
- crawl/crawl +3 -3
- crawl/crawl_wikipedia +2 -2
- joy +455 -272
- keyframe +0 -0
- ogg2wav +38 -0
- paper-qa.code-workspace +0 -11
- remove_extra_whitespace +60 -0
- remove_tag +33 -0
- stats +108 -0
- whisper +49 -0
- whisper2 +72 -0
- zsh/png2mp4.zsh +172 -0
.zshrc
CHANGED
@@ -8,6 +8,20 @@
|
|
8 |
# - conda-env: Adds support for Conda environment management
|
9 |
# 4. Set the custom theme for the shell prompt
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
# Load the custom git wrapper script
|
12 |
source $HOME/toolkit/git-wrapper.zsh
|
13 |
|
@@ -70,6 +84,30 @@ export OMP_NUM_THREADS=24
|
|
70 |
# However, it may slightly reduce performance in some scenarios where parallel tokenization is beneficial
|
71 |
export TOKENIZERS_PARALLELISM=false
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
# Source the Oh My Zsh script
|
74 |
# This line loads Oh My Zsh, a popular framework for managing Zsh configuration
|
75 |
# It sets up various features like themes, plugins, and custom functions
|
@@ -156,30 +194,6 @@ alias rl="source ~/.zshrc"
|
|
156 |
# Alias for quickly editing and reloading the zsh configuration file
|
157 |
alias ezc="nvim ~/.zshrc && source ~/.zshrc"
|
158 |
|
159 |
-
# Source the broot launcher script for enhanced file navigation
|
160 |
-
source /home/kade/.config/broot/launcher/bash/br
|
161 |
-
|
162 |
-
# Source the fzf (Fuzzy Finder) configuration for zsh if it exists
|
163 |
-
# This enables fzf functionality in the shell, including keybindings and auto-completion
|
164 |
-
[ -f ~/.fzf.zsh ] && source ~/.fzf.zsh
|
165 |
-
|
166 |
-
# >>> conda initialize >>>
|
167 |
-
# !! Contents within this block are managed by 'conda init' !!
|
168 |
-
__conda_setup="$('/home/kade/miniconda3/bin/conda' 'shell.zsh' 'hook' 2> /dev/null)"
|
169 |
-
if [ $? -eq 0 ]; then
|
170 |
-
eval "$__conda_setup"
|
171 |
-
else
|
172 |
-
if [ -f "/home/kade/miniconda3/etc/profile.d/conda.sh" ]; then
|
173 |
-
. "/home/kade/miniconda3/etc/profile.d/conda.sh"
|
174 |
-
else
|
175 |
-
export PATH="/home/kade/miniconda3/bin:$PATH"
|
176 |
-
fi
|
177 |
-
fi
|
178 |
-
unset __conda_setup
|
179 |
-
# <<< conda initialize <<<
|
180 |
-
|
181 |
-
unset CONDA_CHANGEPS1
|
182 |
-
|
183 |
display_git_help() {
|
184 |
echo "Git"
|
185 |
echo "---"
|
@@ -769,6 +783,52 @@ chop_lora() {
|
|
769 |
done
|
770 |
}
|
771 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
772 |
# Function: swch (Switch Git Branch)
|
773 |
# Description:
|
774 |
# This function facilitates switching between Git branches while ensuring a clean working directory.
|
@@ -857,34 +917,8 @@ import safetensors, json
|
|
857 |
filePath = '$filePath'
|
858 |
print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
|
859 |
}
|
860 |
-
|
861 |
-
|
862 |
-
# This function converts a series of PNG images into an MP4 video file using ffmpeg.
|
863 |
-
#
|
864 |
-
# Usage:
|
865 |
-
# png2mp4
|
866 |
-
#
|
867 |
-
# Parameters:
|
868 |
-
# None (uses all PNG files in the current directory)
|
869 |
-
#
|
870 |
-
# Actions:
|
871 |
-
# 1. Sets the frame rate to 8 fps
|
872 |
-
# 2. Uses glob pattern to include all PNG files in the current directory
|
873 |
-
# 3. Scales the output video to 1024x1024 resolution
|
874 |
-
# 4. Sets the Constant Rate Factor (CRF) to 28 for good compression
|
875 |
-
# 5. Uses the libx264 codec for H.264 encoding
|
876 |
-
# 6. Sets the pixel format to yuv420p for compatibility
|
877 |
-
# 7. Outputs the result as 'out.mp4' in the current directory
|
878 |
-
#
|
879 |
-
# Notes:
|
880 |
-
# - Requires ffmpeg to be installed and accessible in the system path
|
881 |
-
# - PNG files should be in the current directory
|
882 |
-
# - Output video will be named 'out.mp4' and placed in the current directory
|
883 |
-
# - Adjust the framerate, scale, or CRF value as needed for different results
|
884 |
-
png2mp4() {
|
885 |
-
ffmpeg -framerate 8 -pattern_type glob -i '*.png' -vf scale=1024x1024 -crf 28 \
|
886 |
-
-c:v libx264 -pix_fmt yuv420p out.mp4
|
887 |
-
}
|
888 |
|
889 |
# Function: c
|
890 |
# Description:
|
@@ -920,10 +954,6 @@ c() {
|
|
920 |
conda activate comfyui
|
921 |
python main.py --listen 0.0.0.0 --preview-method taesd --use-pytorch-cross-attention --disable-xformers --front-end-version Comfy-Org/ComfyUI_frontend@latest --fast
|
922 |
}
|
923 |
-
|
924 |
-
# Function: conda_prompt_info
|
925 |
-
# Description:
|
926 |
-
# This function displays information about the currently active Conda environment.
|
927 |
#
|
928 |
# Usage:
|
929 |
# conda_prompt_info
|
|
|
8 |
# - conda-env: Adds support for Conda environment management
|
9 |
# 4. Set the custom theme for the shell prompt
|
10 |
|
11 |
+
# The `export QT_QPA_PLATFORM=offscreen` command is used to set the `QT_QPA_PLATFORM`
|
12 |
+
# environment variable to `offscreen`. This is particularly useful when running Qt
|
13 |
+
# applications in a headless environment, such as a server or a CI/CD pipeline,
|
14 |
+
# where there is no display server available. By setting this variable, Qt
|
15 |
+
# applications can render their graphical output offscreen, allowing them to
|
16 |
+
# run without requiring a graphical user interface (GUI). This is commonly used for
|
17 |
+
# automated testing, rendering, or other tasks that do not require user interaction.
|
18 |
+
export QT_QPA_PLATFORM=offscreen
|
19 |
+
|
20 |
+
# Enable the experimental Just-In-Time (JIT) compiler for Python 3.13.
|
21 |
+
# This can improve performance by compiling Python code to machine code at runtime.
|
22 |
+
# Note: The JIT is only available for x86_64 builds of Python in conda[^1^][1].
|
23 |
+
export PYTHON_JIT=1
|
24 |
+
|
25 |
# Load the custom git wrapper script
|
26 |
source $HOME/toolkit/git-wrapper.zsh
|
27 |
|
|
|
84 |
# However, it may slightly reduce performance in some scenarios where parallel tokenization is beneficial
|
85 |
export TOKENIZERS_PARALLELISM=false
|
86 |
|
87 |
+
# Source the broot launcher script for enhanced file navigation
|
88 |
+
source /home/kade/.config/broot/launcher/bash/br
|
89 |
+
|
90 |
+
# Source the fzf (Fuzzy Finder) configuration for zsh if it exists
|
91 |
+
# This enables fzf functionality in the shell, including keybindings and auto-completion
|
92 |
+
[ -f ~/.fzf.zsh ] && source ~/.fzf.zsh
|
93 |
+
|
94 |
+
# >>> conda initialize >>>
|
95 |
+
# !! Contents within this block are managed by 'conda init' !!
|
96 |
+
__conda_setup="$('/home/kade/miniconda3/bin/conda' 'shell.zsh' 'hook' 2> /dev/null)"
|
97 |
+
if [ $? -eq 0 ]; then
|
98 |
+
eval "$__conda_setup"
|
99 |
+
else
|
100 |
+
if [ -f "/home/kade/miniconda3/etc/profile.d/conda.sh" ]; then
|
101 |
+
. "/home/kade/miniconda3/etc/profile.d/conda.sh"
|
102 |
+
else
|
103 |
+
export PATH="/home/kade/miniconda3/bin:$PATH"
|
104 |
+
fi
|
105 |
+
fi
|
106 |
+
unset __conda_setup
|
107 |
+
# <<< conda initialize <<<
|
108 |
+
|
109 |
+
unset CONDA_CHANGEPS1
|
110 |
+
|
111 |
# Source the Oh My Zsh script
|
112 |
# This line loads Oh My Zsh, a popular framework for managing Zsh configuration
|
113 |
# It sets up various features like themes, plugins, and custom functions
|
|
|
194 |
# Alias for quickly editing and reloading the zsh configuration file
|
195 |
alias ezc="nvim ~/.zshrc && source ~/.zshrc"
|
196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
display_git_help() {
|
198 |
echo "Git"
|
199 |
echo "---"
|
|
|
783 |
done
|
784 |
}
|
785 |
|
786 |
+
# Function cs1
|
787 |
+
# This function chops blocks from an SDXL LoRA's safetensors file to preserve the style information only.
|
788 |
+
# It uses a specific block configuration and saves the output with a modified filename.
|
789 |
+
cs1() {
|
790 |
+
# Get the target safetensors file path from the first argument
|
791 |
+
local target_safetensors=$1
|
792 |
+
|
793 |
+
# Extract the base name of the target safetensors file (without the .safetensors extension)
|
794 |
+
local base_name=$(basename "$target_safetensors" .safetensors)
|
795 |
+
|
796 |
+
# Extract the version and step string from the base name (e.g., v1s400)
|
797 |
+
local version_step=$(echo "$base_name" | grep -o 'v[0-9]*s[0-9]*')
|
798 |
+
|
799 |
+
# Remove the version and step string from the base name to avoid duplication
|
800 |
+
local base_name_no_version=$(echo "$base_name" | sed "s/${version_step}//")
|
801 |
+
|
802 |
+
# Construct the output safetensors filename by appending c1 to the version and step string
|
803 |
+
local output_safetensors="${base_name_no_version}${version_step}c1.safetensors"
|
804 |
+
|
805 |
+
# Run the chop_blocks command with the specified block configuration and output filename
|
806 |
+
~/toolkit/chop_blocks "$target_safetensors" 1,0,0,0,0,0,0,1,1,0,0,0,1,1,1,1,1,1,0,0,0 -o "$output_safetensors"
|
807 |
+
}
|
808 |
+
|
809 |
+
# Function cs2
|
810 |
+
# This function chops blocks from an SDXL LoRA's safetensors file to preserve the style information only.
|
811 |
+
# It uses a different block configuration and saves the output with a modified filename.
|
812 |
+
cs2() {
|
813 |
+
# Get the target safetensors file path from the first argument
|
814 |
+
local target_safetensors=$1
|
815 |
+
|
816 |
+
# Extract the base name of the target safetensors file (without the .safetensors extension)
|
817 |
+
local base_name=$(basename "$target_safetensors" .safetensors)
|
818 |
+
|
819 |
+
# Extract the version and step string from the base name (e.g., v1s400)
|
820 |
+
local version_step=$(echo "$base_name" | grep -o 'v[0-9]*s[0-9]*')
|
821 |
+
|
822 |
+
# Remove the version and step string from the base name to avoid duplication
|
823 |
+
local base_name_no_version=$(echo "$base_name" | sed "s/${version_step}//")
|
824 |
+
|
825 |
+
# Construct the output safetensors filename by appending c2 to the version and step string
|
826 |
+
local output_safetensors="${base_name_no_version}${version_step}c2.safetensors"
|
827 |
+
|
828 |
+
# Run the chop_blocks command with the specified block configuration and output filename
|
829 |
+
~/toolkit/chop_blocks "$target_safetensors" 1,0,0,0,1,1,0,1,1,0,0,0,1,1,1,1,1,1,0,0,0 -o "$output_safetensors"
|
830 |
+
}
|
831 |
+
|
832 |
# Function: swch (Switch Git Branch)
|
833 |
# Description:
|
834 |
# This function facilitates switching between Git branches while ensuring a clean working directory.
|
|
|
917 |
filePath = '$filePath'
|
918 |
print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
|
919 |
}
|
920 |
+
|
921 |
+
source ~/toolkit/zsh/png2mp4.zsh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
922 |
|
923 |
# Function: c
|
924 |
# Description:
|
|
|
954 |
conda activate comfyui
|
955 |
python main.py --listen 0.0.0.0 --preview-method taesd --use-pytorch-cross-attention --disable-xformers --front-end-version Comfy-Org/ComfyUI_frontend@latest --fast
|
956 |
}
|
|
|
|
|
|
|
|
|
957 |
#
|
958 |
# Usage:
|
959 |
# conda_prompt_info
|
audiogen_medium.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import torchaudio
|
3 |
+
from audiocraft.models import AudioGen
|
4 |
+
from audiocraft.data.audio import audio_write
|
5 |
+
|
6 |
+
model = AudioGen.get_pretrained('facebook/audiogen-medium')
|
7 |
+
model.set_generation_params(duration=5) # generate 5 seconds.
|
8 |
+
descriptions = sys.argv[1:]
|
9 |
+
if not descriptions:
|
10 |
+
print('At least one prompt should be provided')
|
11 |
+
sys.exit(1)
|
12 |
+
wav = model.generate(descriptions) # generates 3 samples.
|
13 |
+
|
14 |
+
for one_wav, description in zip(wav, descriptions):
|
15 |
+
file_name = description.replace(' ', '_')
|
16 |
+
# Will save under {idx}.wav, with loudness normalization at -14 db LUFS.
|
17 |
+
audio_write(file_name, one_wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)
|
18 |
+
|
{9em124t2-499968 → cgrkzexw-599808}/clip_model.pt
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1713067838
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9277e041aab3e7f20a8e6ecf7248b663aac1c281daf4472c12a6e5013cf9f0cc
|
3 |
size 1713067838
|
{9em124t2-499968 → cgrkzexw-599808}/config.yaml
RENAMED
@@ -3,7 +3,7 @@ device_batch_size: 2
|
|
3 |
batch_size: 256
|
4 |
learning_rate: 0.0002
|
5 |
warmup_samples: 18000
|
6 |
-
max_samples:
|
7 |
save_every: 50000
|
8 |
test_every: 50000
|
9 |
use_amp: true
|
@@ -19,9 +19,9 @@ adam_beta2: 0.999
|
|
19 |
adam_eps: 1.0e-08
|
20 |
adam_weight_decay: 0.0
|
21 |
clip_grad_norm: 1.0
|
22 |
-
dataset: fancyfeast/joy-captioning-
|
23 |
clip_model: google/siglip-so400m-patch14-384
|
24 |
-
text_model:
|
25 |
resume: null
|
26 |
gradient_checkpointing: false
|
27 |
test_size: 2048
|
|
|
3 |
batch_size: 256
|
4 |
learning_rate: 0.0002
|
5 |
warmup_samples: 18000
|
6 |
+
max_samples: 600000
|
7 |
save_every: 50000
|
8 |
test_every: 50000
|
9 |
use_amp: true
|
|
|
19 |
adam_eps: 1.0e-08
|
20 |
adam_weight_decay: 0.0
|
21 |
clip_grad_norm: 1.0
|
22 |
+
dataset: fancyfeast/joy-captioning-20240924a
|
23 |
clip_model: google/siglip-so400m-patch14-384
|
24 |
+
text_model: ../lora-train/lora_model_vwbzycxh
|
25 |
resume: null
|
26 |
gradient_checkpointing: false
|
27 |
test_size: 2048
|
cgrkzexw-599808/image_adapter.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:38db2fe263be2d494a50be4a7bbfd7b23b76f9d03e4008a1b7df97d6b27894ef
|
3 |
+
size 86067714
|
{9em124t2-499968 → cgrkzexw-599808}/text_model/README.md
RENAMED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
base_model:
|
3 |
library_name: peft
|
4 |
---
|
5 |
|
|
|
1 |
---
|
2 |
+
base_model: unsloth/Meta-Llama-3.1-8B-Instruct
|
3 |
library_name: peft
|
4 |
---
|
5 |
|
{9em124t2-499968 → cgrkzexw-599808}/text_model/adapter_config.json
RENAMED
@@ -1,7 +1,7 @@
|
|
1 |
{
|
2 |
"alpha_pattern": {},
|
3 |
"auto_mapping": null,
|
4 |
-
"base_model_name_or_path": "
|
5 |
"bias": "none",
|
6 |
"fan_in_fan_out": false,
|
7 |
"inference_mode": true,
|
@@ -11,7 +11,7 @@
|
|
11 |
"layers_to_transform": null,
|
12 |
"loftq_config": {},
|
13 |
"lora_alpha": 16,
|
14 |
-
"lora_dropout": 0
|
15 |
"megatron_config": null,
|
16 |
"megatron_core": "megatron.core",
|
17 |
"modules_to_save": null,
|
@@ -21,7 +21,12 @@
|
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
"q_proj",
|
24 |
-
"v_proj"
|
|
|
|
|
|
|
|
|
|
|
25 |
],
|
26 |
"task_type": "CAUSAL_LM",
|
27 |
"use_dora": false,
|
|
|
1 |
{
|
2 |
"alpha_pattern": {},
|
3 |
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "unsloth/Meta-Llama-3.1-8B-Instruct",
|
5 |
"bias": "none",
|
6 |
"fan_in_fan_out": false,
|
7 |
"inference_mode": true,
|
|
|
11 |
"layers_to_transform": null,
|
12 |
"loftq_config": {},
|
13 |
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0,
|
15 |
"megatron_config": null,
|
16 |
"megatron_core": "megatron.core",
|
17 |
"modules_to_save": null,
|
|
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
"q_proj",
|
24 |
+
"v_proj",
|
25 |
+
"gate_proj",
|
26 |
+
"down_proj",
|
27 |
+
"o_proj",
|
28 |
+
"k_proj",
|
29 |
+
"up_proj"
|
30 |
],
|
31 |
"task_type": "CAUSAL_LM",
|
32 |
"use_dora": false,
|
{9em124t2-499968 → cgrkzexw-599808}/text_model/adapter_model.safetensors
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd883ebd089f87e0fab7f17960c5f4451ceae43aecead44a9984b3369018dbdb
|
3 |
+
size 671149168
|
cgrkzexw-599808/text_model/special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<|begin_of_text|>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "<|eot_id|>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "<|finetune_right_pad_id|>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
}
|
23 |
+
}
|
cgrkzexw-599808/text_model/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
cgrkzexw-599808/text_model/tokenizer_config.json
ADDED
@@ -0,0 +1,2064 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"128000": {
|
4 |
+
"content": "<|begin_of_text|>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"128001": {
|
12 |
+
"content": "<|end_of_text|>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"128002": {
|
20 |
+
"content": "<|reserved_special_token_0|>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"128003": {
|
28 |
+
"content": "<|reserved_special_token_1|>",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"128004": {
|
36 |
+
"content": "<|finetune_right_pad_id|>",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
},
|
43 |
+
"128005": {
|
44 |
+
"content": "<|reserved_special_token_2|>",
|
45 |
+
"lstrip": false,
|
46 |
+
"normalized": false,
|
47 |
+
"rstrip": false,
|
48 |
+
"single_word": false,
|
49 |
+
"special": true
|
50 |
+
},
|
51 |
+
"128006": {
|
52 |
+
"content": "<|start_header_id|>",
|
53 |
+
"lstrip": false,
|
54 |
+
"normalized": false,
|
55 |
+
"rstrip": false,
|
56 |
+
"single_word": false,
|
57 |
+
"special": true
|
58 |
+
},
|
59 |
+
"128007": {
|
60 |
+
"content": "<|end_header_id|>",
|
61 |
+
"lstrip": false,
|
62 |
+
"normalized": false,
|
63 |
+
"rstrip": false,
|
64 |
+
"single_word": false,
|
65 |
+
"special": true
|
66 |
+
},
|
67 |
+
"128008": {
|
68 |
+
"content": "<|eom_id|>",
|
69 |
+
"lstrip": false,
|
70 |
+
"normalized": false,
|
71 |
+
"rstrip": false,
|
72 |
+
"single_word": false,
|
73 |
+
"special": true
|
74 |
+
},
|
75 |
+
"128009": {
|
76 |
+
"content": "<|eot_id|>",
|
77 |
+
"lstrip": false,
|
78 |
+
"normalized": false,
|
79 |
+
"rstrip": false,
|
80 |
+
"single_word": false,
|
81 |
+
"special": true
|
82 |
+
},
|
83 |
+
"128010": {
|
84 |
+
"content": "<|python_tag|>",
|
85 |
+
"lstrip": false,
|
86 |
+
"normalized": false,
|
87 |
+
"rstrip": false,
|
88 |
+
"single_word": false,
|
89 |
+
"special": true
|
90 |
+
},
|
91 |
+
"128011": {
|
92 |
+
"content": "<|reserved_special_token_3|>",
|
93 |
+
"lstrip": false,
|
94 |
+
"normalized": false,
|
95 |
+
"rstrip": false,
|
96 |
+
"single_word": false,
|
97 |
+
"special": true
|
98 |
+
},
|
99 |
+
"128012": {
|
100 |
+
"content": "<|reserved_special_token_4|>",
|
101 |
+
"lstrip": false,
|
102 |
+
"normalized": false,
|
103 |
+
"rstrip": false,
|
104 |
+
"single_word": false,
|
105 |
+
"special": true
|
106 |
+
},
|
107 |
+
"128013": {
|
108 |
+
"content": "<|reserved_special_token_5|>",
|
109 |
+
"lstrip": false,
|
110 |
+
"normalized": false,
|
111 |
+
"rstrip": false,
|
112 |
+
"single_word": false,
|
113 |
+
"special": true
|
114 |
+
},
|
115 |
+
"128014": {
|
116 |
+
"content": "<|reserved_special_token_6|>",
|
117 |
+
"lstrip": false,
|
118 |
+
"normalized": false,
|
119 |
+
"rstrip": false,
|
120 |
+
"single_word": false,
|
121 |
+
"special": true
|
122 |
+
},
|
123 |
+
"128015": {
|
124 |
+
"content": "<|reserved_special_token_7|>",
|
125 |
+
"lstrip": false,
|
126 |
+
"normalized": false,
|
127 |
+
"rstrip": false,
|
128 |
+
"single_word": false,
|
129 |
+
"special": true
|
130 |
+
},
|
131 |
+
"128016": {
|
132 |
+
"content": "<|reserved_special_token_8|>",
|
133 |
+
"lstrip": false,
|
134 |
+
"normalized": false,
|
135 |
+
"rstrip": false,
|
136 |
+
"single_word": false,
|
137 |
+
"special": true
|
138 |
+
},
|
139 |
+
"128017": {
|
140 |
+
"content": "<|reserved_special_token_9|>",
|
141 |
+
"lstrip": false,
|
142 |
+
"normalized": false,
|
143 |
+
"rstrip": false,
|
144 |
+
"single_word": false,
|
145 |
+
"special": true
|
146 |
+
},
|
147 |
+
"128018": {
|
148 |
+
"content": "<|reserved_special_token_10|>",
|
149 |
+
"lstrip": false,
|
150 |
+
"normalized": false,
|
151 |
+
"rstrip": false,
|
152 |
+
"single_word": false,
|
153 |
+
"special": true
|
154 |
+
},
|
155 |
+
"128019": {
|
156 |
+
"content": "<|reserved_special_token_11|>",
|
157 |
+
"lstrip": false,
|
158 |
+
"normalized": false,
|
159 |
+
"rstrip": false,
|
160 |
+
"single_word": false,
|
161 |
+
"special": true
|
162 |
+
},
|
163 |
+
"128020": {
|
164 |
+
"content": "<|reserved_special_token_12|>",
|
165 |
+
"lstrip": false,
|
166 |
+
"normalized": false,
|
167 |
+
"rstrip": false,
|
168 |
+
"single_word": false,
|
169 |
+
"special": true
|
170 |
+
},
|
171 |
+
"128021": {
|
172 |
+
"content": "<|reserved_special_token_13|>",
|
173 |
+
"lstrip": false,
|
174 |
+
"normalized": false,
|
175 |
+
"rstrip": false,
|
176 |
+
"single_word": false,
|
177 |
+
"special": true
|
178 |
+
},
|
179 |
+
"128022": {
|
180 |
+
"content": "<|reserved_special_token_14|>",
|
181 |
+
"lstrip": false,
|
182 |
+
"normalized": false,
|
183 |
+
"rstrip": false,
|
184 |
+
"single_word": false,
|
185 |
+
"special": true
|
186 |
+
},
|
187 |
+
"128023": {
|
188 |
+
"content": "<|reserved_special_token_15|>",
|
189 |
+
"lstrip": false,
|
190 |
+
"normalized": false,
|
191 |
+
"rstrip": false,
|
192 |
+
"single_word": false,
|
193 |
+
"special": true
|
194 |
+
},
|
195 |
+
"128024": {
|
196 |
+
"content": "<|reserved_special_token_16|>",
|
197 |
+
"lstrip": false,
|
198 |
+
"normalized": false,
|
199 |
+
"rstrip": false,
|
200 |
+
"single_word": false,
|
201 |
+
"special": true
|
202 |
+
},
|
203 |
+
"128025": {
|
204 |
+
"content": "<|reserved_special_token_17|>",
|
205 |
+
"lstrip": false,
|
206 |
+
"normalized": false,
|
207 |
+
"rstrip": false,
|
208 |
+
"single_word": false,
|
209 |
+
"special": true
|
210 |
+
},
|
211 |
+
"128026": {
|
212 |
+
"content": "<|reserved_special_token_18|>",
|
213 |
+
"lstrip": false,
|
214 |
+
"normalized": false,
|
215 |
+
"rstrip": false,
|
216 |
+
"single_word": false,
|
217 |
+
"special": true
|
218 |
+
},
|
219 |
+
"128027": {
|
220 |
+
"content": "<|reserved_special_token_19|>",
|
221 |
+
"lstrip": false,
|
222 |
+
"normalized": false,
|
223 |
+
"rstrip": false,
|
224 |
+
"single_word": false,
|
225 |
+
"special": true
|
226 |
+
},
|
227 |
+
"128028": {
|
228 |
+
"content": "<|reserved_special_token_20|>",
|
229 |
+
"lstrip": false,
|
230 |
+
"normalized": false,
|
231 |
+
"rstrip": false,
|
232 |
+
"single_word": false,
|
233 |
+
"special": true
|
234 |
+
},
|
235 |
+
"128029": {
|
236 |
+
"content": "<|reserved_special_token_21|>",
|
237 |
+
"lstrip": false,
|
238 |
+
"normalized": false,
|
239 |
+
"rstrip": false,
|
240 |
+
"single_word": false,
|
241 |
+
"special": true
|
242 |
+
},
|
243 |
+
"128030": {
|
244 |
+
"content": "<|reserved_special_token_22|>",
|
245 |
+
"lstrip": false,
|
246 |
+
"normalized": false,
|
247 |
+
"rstrip": false,
|
248 |
+
"single_word": false,
|
249 |
+
"special": true
|
250 |
+
},
|
251 |
+
"128031": {
|
252 |
+
"content": "<|reserved_special_token_23|>",
|
253 |
+
"lstrip": false,
|
254 |
+
"normalized": false,
|
255 |
+
"rstrip": false,
|
256 |
+
"single_word": false,
|
257 |
+
"special": true
|
258 |
+
},
|
259 |
+
"128032": {
|
260 |
+
"content": "<|reserved_special_token_24|>",
|
261 |
+
"lstrip": false,
|
262 |
+
"normalized": false,
|
263 |
+
"rstrip": false,
|
264 |
+
"single_word": false,
|
265 |
+
"special": true
|
266 |
+
},
|
267 |
+
"128033": {
|
268 |
+
"content": "<|reserved_special_token_25|>",
|
269 |
+
"lstrip": false,
|
270 |
+
"normalized": false,
|
271 |
+
"rstrip": false,
|
272 |
+
"single_word": false,
|
273 |
+
"special": true
|
274 |
+
},
|
275 |
+
"128034": {
|
276 |
+
"content": "<|reserved_special_token_26|>",
|
277 |
+
"lstrip": false,
|
278 |
+
"normalized": false,
|
279 |
+
"rstrip": false,
|
280 |
+
"single_word": false,
|
281 |
+
"special": true
|
282 |
+
},
|
283 |
+
"128035": {
|
284 |
+
"content": "<|reserved_special_token_27|>",
|
285 |
+
"lstrip": false,
|
286 |
+
"normalized": false,
|
287 |
+
"rstrip": false,
|
288 |
+
"single_word": false,
|
289 |
+
"special": true
|
290 |
+
},
|
291 |
+
"128036": {
|
292 |
+
"content": "<|reserved_special_token_28|>",
|
293 |
+
"lstrip": false,
|
294 |
+
"normalized": false,
|
295 |
+
"rstrip": false,
|
296 |
+
"single_word": false,
|
297 |
+
"special": true
|
298 |
+
},
|
299 |
+
"128037": {
|
300 |
+
"content": "<|reserved_special_token_29|>",
|
301 |
+
"lstrip": false,
|
302 |
+
"normalized": false,
|
303 |
+
"rstrip": false,
|
304 |
+
"single_word": false,
|
305 |
+
"special": true
|
306 |
+
},
|
307 |
+
"128038": {
|
308 |
+
"content": "<|reserved_special_token_30|>",
|
309 |
+
"lstrip": false,
|
310 |
+
"normalized": false,
|
311 |
+
"rstrip": false,
|
312 |
+
"single_word": false,
|
313 |
+
"special": true
|
314 |
+
},
|
315 |
+
"128039": {
|
316 |
+
"content": "<|reserved_special_token_31|>",
|
317 |
+
"lstrip": false,
|
318 |
+
"normalized": false,
|
319 |
+
"rstrip": false,
|
320 |
+
"single_word": false,
|
321 |
+
"special": true
|
322 |
+
},
|
323 |
+
"128040": {
|
324 |
+
"content": "<|reserved_special_token_32|>",
|
325 |
+
"lstrip": false,
|
326 |
+
"normalized": false,
|
327 |
+
"rstrip": false,
|
328 |
+
"single_word": false,
|
329 |
+
"special": true
|
330 |
+
},
|
331 |
+
"128041": {
|
332 |
+
"content": "<|reserved_special_token_33|>",
|
333 |
+
"lstrip": false,
|
334 |
+
"normalized": false,
|
335 |
+
"rstrip": false,
|
336 |
+
"single_word": false,
|
337 |
+
"special": true
|
338 |
+
},
|
339 |
+
"128042": {
|
340 |
+
"content": "<|reserved_special_token_34|>",
|
341 |
+
"lstrip": false,
|
342 |
+
"normalized": false,
|
343 |
+
"rstrip": false,
|
344 |
+
"single_word": false,
|
345 |
+
"special": true
|
346 |
+
},
|
347 |
+
"128043": {
|
348 |
+
"content": "<|reserved_special_token_35|>",
|
349 |
+
"lstrip": false,
|
350 |
+
"normalized": false,
|
351 |
+
"rstrip": false,
|
352 |
+
"single_word": false,
|
353 |
+
"special": true
|
354 |
+
},
|
355 |
+
"128044": {
|
356 |
+
"content": "<|reserved_special_token_36|>",
|
357 |
+
"lstrip": false,
|
358 |
+
"normalized": false,
|
359 |
+
"rstrip": false,
|
360 |
+
"single_word": false,
|
361 |
+
"special": true
|
362 |
+
},
|
363 |
+
"128045": {
|
364 |
+
"content": "<|reserved_special_token_37|>",
|
365 |
+
"lstrip": false,
|
366 |
+
"normalized": false,
|
367 |
+
"rstrip": false,
|
368 |
+
"single_word": false,
|
369 |
+
"special": true
|
370 |
+
},
|
371 |
+
"128046": {
|
372 |
+
"content": "<|reserved_special_token_38|>",
|
373 |
+
"lstrip": false,
|
374 |
+
"normalized": false,
|
375 |
+
"rstrip": false,
|
376 |
+
"single_word": false,
|
377 |
+
"special": true
|
378 |
+
},
|
379 |
+
"128047": {
|
380 |
+
"content": "<|reserved_special_token_39|>",
|
381 |
+
"lstrip": false,
|
382 |
+
"normalized": false,
|
383 |
+
"rstrip": false,
|
384 |
+
"single_word": false,
|
385 |
+
"special": true
|
386 |
+
},
|
387 |
+
"128048": {
|
388 |
+
"content": "<|reserved_special_token_40|>",
|
389 |
+
"lstrip": false,
|
390 |
+
"normalized": false,
|
391 |
+
"rstrip": false,
|
392 |
+
"single_word": false,
|
393 |
+
"special": true
|
394 |
+
},
|
395 |
+
"128049": {
|
396 |
+
"content": "<|reserved_special_token_41|>",
|
397 |
+
"lstrip": false,
|
398 |
+
"normalized": false,
|
399 |
+
"rstrip": false,
|
400 |
+
"single_word": false,
|
401 |
+
"special": true
|
402 |
+
},
|
403 |
+
"128050": {
|
404 |
+
"content": "<|reserved_special_token_42|>",
|
405 |
+
"lstrip": false,
|
406 |
+
"normalized": false,
|
407 |
+
"rstrip": false,
|
408 |
+
"single_word": false,
|
409 |
+
"special": true
|
410 |
+
},
|
411 |
+
"128051": {
|
412 |
+
"content": "<|reserved_special_token_43|>",
|
413 |
+
"lstrip": false,
|
414 |
+
"normalized": false,
|
415 |
+
"rstrip": false,
|
416 |
+
"single_word": false,
|
417 |
+
"special": true
|
418 |
+
},
|
419 |
+
"128052": {
|
420 |
+
"content": "<|reserved_special_token_44|>",
|
421 |
+
"lstrip": false,
|
422 |
+
"normalized": false,
|
423 |
+
"rstrip": false,
|
424 |
+
"single_word": false,
|
425 |
+
"special": true
|
426 |
+
},
|
427 |
+
"128053": {
|
428 |
+
"content": "<|reserved_special_token_45|>",
|
429 |
+
"lstrip": false,
|
430 |
+
"normalized": false,
|
431 |
+
"rstrip": false,
|
432 |
+
"single_word": false,
|
433 |
+
"special": true
|
434 |
+
},
|
435 |
+
"128054": {
|
436 |
+
"content": "<|reserved_special_token_46|>",
|
437 |
+
"lstrip": false,
|
438 |
+
"normalized": false,
|
439 |
+
"rstrip": false,
|
440 |
+
"single_word": false,
|
441 |
+
"special": true
|
442 |
+
},
|
443 |
+
"128055": {
|
444 |
+
"content": "<|reserved_special_token_47|>",
|
445 |
+
"lstrip": false,
|
446 |
+
"normalized": false,
|
447 |
+
"rstrip": false,
|
448 |
+
"single_word": false,
|
449 |
+
"special": true
|
450 |
+
},
|
451 |
+
"128056": {
|
452 |
+
"content": "<|reserved_special_token_48|>",
|
453 |
+
"lstrip": false,
|
454 |
+
"normalized": false,
|
455 |
+
"rstrip": false,
|
456 |
+
"single_word": false,
|
457 |
+
"special": true
|
458 |
+
},
|
459 |
+
"128057": {
|
460 |
+
"content": "<|reserved_special_token_49|>",
|
461 |
+
"lstrip": false,
|
462 |
+
"normalized": false,
|
463 |
+
"rstrip": false,
|
464 |
+
"single_word": false,
|
465 |
+
"special": true
|
466 |
+
},
|
467 |
+
"128058": {
|
468 |
+
"content": "<|reserved_special_token_50|>",
|
469 |
+
"lstrip": false,
|
470 |
+
"normalized": false,
|
471 |
+
"rstrip": false,
|
472 |
+
"single_word": false,
|
473 |
+
"special": true
|
474 |
+
},
|
475 |
+
"128059": {
|
476 |
+
"content": "<|reserved_special_token_51|>",
|
477 |
+
"lstrip": false,
|
478 |
+
"normalized": false,
|
479 |
+
"rstrip": false,
|
480 |
+
"single_word": false,
|
481 |
+
"special": true
|
482 |
+
},
|
483 |
+
"128060": {
|
484 |
+
"content": "<|reserved_special_token_52|>",
|
485 |
+
"lstrip": false,
|
486 |
+
"normalized": false,
|
487 |
+
"rstrip": false,
|
488 |
+
"single_word": false,
|
489 |
+
"special": true
|
490 |
+
},
|
491 |
+
"128061": {
|
492 |
+
"content": "<|reserved_special_token_53|>",
|
493 |
+
"lstrip": false,
|
494 |
+
"normalized": false,
|
495 |
+
"rstrip": false,
|
496 |
+
"single_word": false,
|
497 |
+
"special": true
|
498 |
+
},
|
499 |
+
"128062": {
|
500 |
+
"content": "<|reserved_special_token_54|>",
|
501 |
+
"lstrip": false,
|
502 |
+
"normalized": false,
|
503 |
+
"rstrip": false,
|
504 |
+
"single_word": false,
|
505 |
+
"special": true
|
506 |
+
},
|
507 |
+
"128063": {
|
508 |
+
"content": "<|reserved_special_token_55|>",
|
509 |
+
"lstrip": false,
|
510 |
+
"normalized": false,
|
511 |
+
"rstrip": false,
|
512 |
+
"single_word": false,
|
513 |
+
"special": true
|
514 |
+
},
|
515 |
+
"128064": {
|
516 |
+
"content": "<|reserved_special_token_56|>",
|
517 |
+
"lstrip": false,
|
518 |
+
"normalized": false,
|
519 |
+
"rstrip": false,
|
520 |
+
"single_word": false,
|
521 |
+
"special": true
|
522 |
+
},
|
523 |
+
"128065": {
|
524 |
+
"content": "<|reserved_special_token_57|>",
|
525 |
+
"lstrip": false,
|
526 |
+
"normalized": false,
|
527 |
+
"rstrip": false,
|
528 |
+
"single_word": false,
|
529 |
+
"special": true
|
530 |
+
},
|
531 |
+
"128066": {
|
532 |
+
"content": "<|reserved_special_token_58|>",
|
533 |
+
"lstrip": false,
|
534 |
+
"normalized": false,
|
535 |
+
"rstrip": false,
|
536 |
+
"single_word": false,
|
537 |
+
"special": true
|
538 |
+
},
|
539 |
+
"128067": {
|
540 |
+
"content": "<|reserved_special_token_59|>",
|
541 |
+
"lstrip": false,
|
542 |
+
"normalized": false,
|
543 |
+
"rstrip": false,
|
544 |
+
"single_word": false,
|
545 |
+
"special": true
|
546 |
+
},
|
547 |
+
"128068": {
|
548 |
+
"content": "<|reserved_special_token_60|>",
|
549 |
+
"lstrip": false,
|
550 |
+
"normalized": false,
|
551 |
+
"rstrip": false,
|
552 |
+
"single_word": false,
|
553 |
+
"special": true
|
554 |
+
},
|
555 |
+
"128069": {
|
556 |
+
"content": "<|reserved_special_token_61|>",
|
557 |
+
"lstrip": false,
|
558 |
+
"normalized": false,
|
559 |
+
"rstrip": false,
|
560 |
+
"single_word": false,
|
561 |
+
"special": true
|
562 |
+
},
|
563 |
+
"128070": {
|
564 |
+
"content": "<|reserved_special_token_62|>",
|
565 |
+
"lstrip": false,
|
566 |
+
"normalized": false,
|
567 |
+
"rstrip": false,
|
568 |
+
"single_word": false,
|
569 |
+
"special": true
|
570 |
+
},
|
571 |
+
"128071": {
|
572 |
+
"content": "<|reserved_special_token_63|>",
|
573 |
+
"lstrip": false,
|
574 |
+
"normalized": false,
|
575 |
+
"rstrip": false,
|
576 |
+
"single_word": false,
|
577 |
+
"special": true
|
578 |
+
},
|
579 |
+
"128072": {
|
580 |
+
"content": "<|reserved_special_token_64|>",
|
581 |
+
"lstrip": false,
|
582 |
+
"normalized": false,
|
583 |
+
"rstrip": false,
|
584 |
+
"single_word": false,
|
585 |
+
"special": true
|
586 |
+
},
|
587 |
+
"128073": {
|
588 |
+
"content": "<|reserved_special_token_65|>",
|
589 |
+
"lstrip": false,
|
590 |
+
"normalized": false,
|
591 |
+
"rstrip": false,
|
592 |
+
"single_word": false,
|
593 |
+
"special": true
|
594 |
+
},
|
595 |
+
"128074": {
|
596 |
+
"content": "<|reserved_special_token_66|>",
|
597 |
+
"lstrip": false,
|
598 |
+
"normalized": false,
|
599 |
+
"rstrip": false,
|
600 |
+
"single_word": false,
|
601 |
+
"special": true
|
602 |
+
},
|
603 |
+
"128075": {
|
604 |
+
"content": "<|reserved_special_token_67|>",
|
605 |
+
"lstrip": false,
|
606 |
+
"normalized": false,
|
607 |
+
"rstrip": false,
|
608 |
+
"single_word": false,
|
609 |
+
"special": true
|
610 |
+
},
|
611 |
+
"128076": {
|
612 |
+
"content": "<|reserved_special_token_68|>",
|
613 |
+
"lstrip": false,
|
614 |
+
"normalized": false,
|
615 |
+
"rstrip": false,
|
616 |
+
"single_word": false,
|
617 |
+
"special": true
|
618 |
+
},
|
619 |
+
"128077": {
|
620 |
+
"content": "<|reserved_special_token_69|>",
|
621 |
+
"lstrip": false,
|
622 |
+
"normalized": false,
|
623 |
+
"rstrip": false,
|
624 |
+
"single_word": false,
|
625 |
+
"special": true
|
626 |
+
},
|
627 |
+
"128078": {
|
628 |
+
"content": "<|reserved_special_token_70|>",
|
629 |
+
"lstrip": false,
|
630 |
+
"normalized": false,
|
631 |
+
"rstrip": false,
|
632 |
+
"single_word": false,
|
633 |
+
"special": true
|
634 |
+
},
|
635 |
+
"128079": {
|
636 |
+
"content": "<|reserved_special_token_71|>",
|
637 |
+
"lstrip": false,
|
638 |
+
"normalized": false,
|
639 |
+
"rstrip": false,
|
640 |
+
"single_word": false,
|
641 |
+
"special": true
|
642 |
+
},
|
643 |
+
"128080": {
|
644 |
+
"content": "<|reserved_special_token_72|>",
|
645 |
+
"lstrip": false,
|
646 |
+
"normalized": false,
|
647 |
+
"rstrip": false,
|
648 |
+
"single_word": false,
|
649 |
+
"special": true
|
650 |
+
},
|
651 |
+
"128081": {
|
652 |
+
"content": "<|reserved_special_token_73|>",
|
653 |
+
"lstrip": false,
|
654 |
+
"normalized": false,
|
655 |
+
"rstrip": false,
|
656 |
+
"single_word": false,
|
657 |
+
"special": true
|
658 |
+
},
|
659 |
+
"128082": {
|
660 |
+
"content": "<|reserved_special_token_74|>",
|
661 |
+
"lstrip": false,
|
662 |
+
"normalized": false,
|
663 |
+
"rstrip": false,
|
664 |
+
"single_word": false,
|
665 |
+
"special": true
|
666 |
+
},
|
667 |
+
"128083": {
|
668 |
+
"content": "<|reserved_special_token_75|>",
|
669 |
+
"lstrip": false,
|
670 |
+
"normalized": false,
|
671 |
+
"rstrip": false,
|
672 |
+
"single_word": false,
|
673 |
+
"special": true
|
674 |
+
},
|
675 |
+
"128084": {
|
676 |
+
"content": "<|reserved_special_token_76|>",
|
677 |
+
"lstrip": false,
|
678 |
+
"normalized": false,
|
679 |
+
"rstrip": false,
|
680 |
+
"single_word": false,
|
681 |
+
"special": true
|
682 |
+
},
|
683 |
+
"128085": {
|
684 |
+
"content": "<|reserved_special_token_77|>",
|
685 |
+
"lstrip": false,
|
686 |
+
"normalized": false,
|
687 |
+
"rstrip": false,
|
688 |
+
"single_word": false,
|
689 |
+
"special": true
|
690 |
+
},
|
691 |
+
"128086": {
|
692 |
+
"content": "<|reserved_special_token_78|>",
|
693 |
+
"lstrip": false,
|
694 |
+
"normalized": false,
|
695 |
+
"rstrip": false,
|
696 |
+
"single_word": false,
|
697 |
+
"special": true
|
698 |
+
},
|
699 |
+
"128087": {
|
700 |
+
"content": "<|reserved_special_token_79|>",
|
701 |
+
"lstrip": false,
|
702 |
+
"normalized": false,
|
703 |
+
"rstrip": false,
|
704 |
+
"single_word": false,
|
705 |
+
"special": true
|
706 |
+
},
|
707 |
+
"128088": {
|
708 |
+
"content": "<|reserved_special_token_80|>",
|
709 |
+
"lstrip": false,
|
710 |
+
"normalized": false,
|
711 |
+
"rstrip": false,
|
712 |
+
"single_word": false,
|
713 |
+
"special": true
|
714 |
+
},
|
715 |
+
"128089": {
|
716 |
+
"content": "<|reserved_special_token_81|>",
|
717 |
+
"lstrip": false,
|
718 |
+
"normalized": false,
|
719 |
+
"rstrip": false,
|
720 |
+
"single_word": false,
|
721 |
+
"special": true
|
722 |
+
},
|
723 |
+
"128090": {
|
724 |
+
"content": "<|reserved_special_token_82|>",
|
725 |
+
"lstrip": false,
|
726 |
+
"normalized": false,
|
727 |
+
"rstrip": false,
|
728 |
+
"single_word": false,
|
729 |
+
"special": true
|
730 |
+
},
|
731 |
+
"128091": {
|
732 |
+
"content": "<|reserved_special_token_83|>",
|
733 |
+
"lstrip": false,
|
734 |
+
"normalized": false,
|
735 |
+
"rstrip": false,
|
736 |
+
"single_word": false,
|
737 |
+
"special": true
|
738 |
+
},
|
739 |
+
"128092": {
|
740 |
+
"content": "<|reserved_special_token_84|>",
|
741 |
+
"lstrip": false,
|
742 |
+
"normalized": false,
|
743 |
+
"rstrip": false,
|
744 |
+
"single_word": false,
|
745 |
+
"special": true
|
746 |
+
},
|
747 |
+
"128093": {
|
748 |
+
"content": "<|reserved_special_token_85|>",
|
749 |
+
"lstrip": false,
|
750 |
+
"normalized": false,
|
751 |
+
"rstrip": false,
|
752 |
+
"single_word": false,
|
753 |
+
"special": true
|
754 |
+
},
|
755 |
+
"128094": {
|
756 |
+
"content": "<|reserved_special_token_86|>",
|
757 |
+
"lstrip": false,
|
758 |
+
"normalized": false,
|
759 |
+
"rstrip": false,
|
760 |
+
"single_word": false,
|
761 |
+
"special": true
|
762 |
+
},
|
763 |
+
"128095": {
|
764 |
+
"content": "<|reserved_special_token_87|>",
|
765 |
+
"lstrip": false,
|
766 |
+
"normalized": false,
|
767 |
+
"rstrip": false,
|
768 |
+
"single_word": false,
|
769 |
+
"special": true
|
770 |
+
},
|
771 |
+
"128096": {
|
772 |
+
"content": "<|reserved_special_token_88|>",
|
773 |
+
"lstrip": false,
|
774 |
+
"normalized": false,
|
775 |
+
"rstrip": false,
|
776 |
+
"single_word": false,
|
777 |
+
"special": true
|
778 |
+
},
|
779 |
+
"128097": {
|
780 |
+
"content": "<|reserved_special_token_89|>",
|
781 |
+
"lstrip": false,
|
782 |
+
"normalized": false,
|
783 |
+
"rstrip": false,
|
784 |
+
"single_word": false,
|
785 |
+
"special": true
|
786 |
+
},
|
787 |
+
"128098": {
|
788 |
+
"content": "<|reserved_special_token_90|>",
|
789 |
+
"lstrip": false,
|
790 |
+
"normalized": false,
|
791 |
+
"rstrip": false,
|
792 |
+
"single_word": false,
|
793 |
+
"special": true
|
794 |
+
},
|
795 |
+
"128099": {
|
796 |
+
"content": "<|reserved_special_token_91|>",
|
797 |
+
"lstrip": false,
|
798 |
+
"normalized": false,
|
799 |
+
"rstrip": false,
|
800 |
+
"single_word": false,
|
801 |
+
"special": true
|
802 |
+
},
|
803 |
+
"128100": {
|
804 |
+
"content": "<|reserved_special_token_92|>",
|
805 |
+
"lstrip": false,
|
806 |
+
"normalized": false,
|
807 |
+
"rstrip": false,
|
808 |
+
"single_word": false,
|
809 |
+
"special": true
|
810 |
+
},
|
811 |
+
"128101": {
|
812 |
+
"content": "<|reserved_special_token_93|>",
|
813 |
+
"lstrip": false,
|
814 |
+
"normalized": false,
|
815 |
+
"rstrip": false,
|
816 |
+
"single_word": false,
|
817 |
+
"special": true
|
818 |
+
},
|
819 |
+
"128102": {
|
820 |
+
"content": "<|reserved_special_token_94|>",
|
821 |
+
"lstrip": false,
|
822 |
+
"normalized": false,
|
823 |
+
"rstrip": false,
|
824 |
+
"single_word": false,
|
825 |
+
"special": true
|
826 |
+
},
|
827 |
+
"128103": {
|
828 |
+
"content": "<|reserved_special_token_95|>",
|
829 |
+
"lstrip": false,
|
830 |
+
"normalized": false,
|
831 |
+
"rstrip": false,
|
832 |
+
"single_word": false,
|
833 |
+
"special": true
|
834 |
+
},
|
835 |
+
"128104": {
|
836 |
+
"content": "<|reserved_special_token_96|>",
|
837 |
+
"lstrip": false,
|
838 |
+
"normalized": false,
|
839 |
+
"rstrip": false,
|
840 |
+
"single_word": false,
|
841 |
+
"special": true
|
842 |
+
},
|
843 |
+
"128105": {
|
844 |
+
"content": "<|reserved_special_token_97|>",
|
845 |
+
"lstrip": false,
|
846 |
+
"normalized": false,
|
847 |
+
"rstrip": false,
|
848 |
+
"single_word": false,
|
849 |
+
"special": true
|
850 |
+
},
|
851 |
+
"128106": {
|
852 |
+
"content": "<|reserved_special_token_98|>",
|
853 |
+
"lstrip": false,
|
854 |
+
"normalized": false,
|
855 |
+
"rstrip": false,
|
856 |
+
"single_word": false,
|
857 |
+
"special": true
|
858 |
+
},
|
859 |
+
"128107": {
|
860 |
+
"content": "<|reserved_special_token_99|>",
|
861 |
+
"lstrip": false,
|
862 |
+
"normalized": false,
|
863 |
+
"rstrip": false,
|
864 |
+
"single_word": false,
|
865 |
+
"special": true
|
866 |
+
},
|
867 |
+
"128108": {
|
868 |
+
"content": "<|reserved_special_token_100|>",
|
869 |
+
"lstrip": false,
|
870 |
+
"normalized": false,
|
871 |
+
"rstrip": false,
|
872 |
+
"single_word": false,
|
873 |
+
"special": true
|
874 |
+
},
|
875 |
+
"128109": {
|
876 |
+
"content": "<|reserved_special_token_101|>",
|
877 |
+
"lstrip": false,
|
878 |
+
"normalized": false,
|
879 |
+
"rstrip": false,
|
880 |
+
"single_word": false,
|
881 |
+
"special": true
|
882 |
+
},
|
883 |
+
"128110": {
|
884 |
+
"content": "<|reserved_special_token_102|>",
|
885 |
+
"lstrip": false,
|
886 |
+
"normalized": false,
|
887 |
+
"rstrip": false,
|
888 |
+
"single_word": false,
|
889 |
+
"special": true
|
890 |
+
},
|
891 |
+
"128111": {
|
892 |
+
"content": "<|reserved_special_token_103|>",
|
893 |
+
"lstrip": false,
|
894 |
+
"normalized": false,
|
895 |
+
"rstrip": false,
|
896 |
+
"single_word": false,
|
897 |
+
"special": true
|
898 |
+
},
|
899 |
+
"128112": {
|
900 |
+
"content": "<|reserved_special_token_104|>",
|
901 |
+
"lstrip": false,
|
902 |
+
"normalized": false,
|
903 |
+
"rstrip": false,
|
904 |
+
"single_word": false,
|
905 |
+
"special": true
|
906 |
+
},
|
907 |
+
"128113": {
|
908 |
+
"content": "<|reserved_special_token_105|>",
|
909 |
+
"lstrip": false,
|
910 |
+
"normalized": false,
|
911 |
+
"rstrip": false,
|
912 |
+
"single_word": false,
|
913 |
+
"special": true
|
914 |
+
},
|
915 |
+
"128114": {
|
916 |
+
"content": "<|reserved_special_token_106|>",
|
917 |
+
"lstrip": false,
|
918 |
+
"normalized": false,
|
919 |
+
"rstrip": false,
|
920 |
+
"single_word": false,
|
921 |
+
"special": true
|
922 |
+
},
|
923 |
+
"128115": {
|
924 |
+
"content": "<|reserved_special_token_107|>",
|
925 |
+
"lstrip": false,
|
926 |
+
"normalized": false,
|
927 |
+
"rstrip": false,
|
928 |
+
"single_word": false,
|
929 |
+
"special": true
|
930 |
+
},
|
931 |
+
"128116": {
|
932 |
+
"content": "<|reserved_special_token_108|>",
|
933 |
+
"lstrip": false,
|
934 |
+
"normalized": false,
|
935 |
+
"rstrip": false,
|
936 |
+
"single_word": false,
|
937 |
+
"special": true
|
938 |
+
},
|
939 |
+
"128117": {
|
940 |
+
"content": "<|reserved_special_token_109|>",
|
941 |
+
"lstrip": false,
|
942 |
+
"normalized": false,
|
943 |
+
"rstrip": false,
|
944 |
+
"single_word": false,
|
945 |
+
"special": true
|
946 |
+
},
|
947 |
+
"128118": {
|
948 |
+
"content": "<|reserved_special_token_110|>",
|
949 |
+
"lstrip": false,
|
950 |
+
"normalized": false,
|
951 |
+
"rstrip": false,
|
952 |
+
"single_word": false,
|
953 |
+
"special": true
|
954 |
+
},
|
955 |
+
"128119": {
|
956 |
+
"content": "<|reserved_special_token_111|>",
|
957 |
+
"lstrip": false,
|
958 |
+
"normalized": false,
|
959 |
+
"rstrip": false,
|
960 |
+
"single_word": false,
|
961 |
+
"special": true
|
962 |
+
},
|
963 |
+
"128120": {
|
964 |
+
"content": "<|reserved_special_token_112|>",
|
965 |
+
"lstrip": false,
|
966 |
+
"normalized": false,
|
967 |
+
"rstrip": false,
|
968 |
+
"single_word": false,
|
969 |
+
"special": true
|
970 |
+
},
|
971 |
+
"128121": {
|
972 |
+
"content": "<|reserved_special_token_113|>",
|
973 |
+
"lstrip": false,
|
974 |
+
"normalized": false,
|
975 |
+
"rstrip": false,
|
976 |
+
"single_word": false,
|
977 |
+
"special": true
|
978 |
+
},
|
979 |
+
"128122": {
|
980 |
+
"content": "<|reserved_special_token_114|>",
|
981 |
+
"lstrip": false,
|
982 |
+
"normalized": false,
|
983 |
+
"rstrip": false,
|
984 |
+
"single_word": false,
|
985 |
+
"special": true
|
986 |
+
},
|
987 |
+
"128123": {
|
988 |
+
"content": "<|reserved_special_token_115|>",
|
989 |
+
"lstrip": false,
|
990 |
+
"normalized": false,
|
991 |
+
"rstrip": false,
|
992 |
+
"single_word": false,
|
993 |
+
"special": true
|
994 |
+
},
|
995 |
+
"128124": {
|
996 |
+
"content": "<|reserved_special_token_116|>",
|
997 |
+
"lstrip": false,
|
998 |
+
"normalized": false,
|
999 |
+
"rstrip": false,
|
1000 |
+
"single_word": false,
|
1001 |
+
"special": true
|
1002 |
+
},
|
1003 |
+
"128125": {
|
1004 |
+
"content": "<|reserved_special_token_117|>",
|
1005 |
+
"lstrip": false,
|
1006 |
+
"normalized": false,
|
1007 |
+
"rstrip": false,
|
1008 |
+
"single_word": false,
|
1009 |
+
"special": true
|
1010 |
+
},
|
1011 |
+
"128126": {
|
1012 |
+
"content": "<|reserved_special_token_118|>",
|
1013 |
+
"lstrip": false,
|
1014 |
+
"normalized": false,
|
1015 |
+
"rstrip": false,
|
1016 |
+
"single_word": false,
|
1017 |
+
"special": true
|
1018 |
+
},
|
1019 |
+
"128127": {
|
1020 |
+
"content": "<|reserved_special_token_119|>",
|
1021 |
+
"lstrip": false,
|
1022 |
+
"normalized": false,
|
1023 |
+
"rstrip": false,
|
1024 |
+
"single_word": false,
|
1025 |
+
"special": true
|
1026 |
+
},
|
1027 |
+
"128128": {
|
1028 |
+
"content": "<|reserved_special_token_120|>",
|
1029 |
+
"lstrip": false,
|
1030 |
+
"normalized": false,
|
1031 |
+
"rstrip": false,
|
1032 |
+
"single_word": false,
|
1033 |
+
"special": true
|
1034 |
+
},
|
1035 |
+
"128129": {
|
1036 |
+
"content": "<|reserved_special_token_121|>",
|
1037 |
+
"lstrip": false,
|
1038 |
+
"normalized": false,
|
1039 |
+
"rstrip": false,
|
1040 |
+
"single_word": false,
|
1041 |
+
"special": true
|
1042 |
+
},
|
1043 |
+
"128130": {
|
1044 |
+
"content": "<|reserved_special_token_122|>",
|
1045 |
+
"lstrip": false,
|
1046 |
+
"normalized": false,
|
1047 |
+
"rstrip": false,
|
1048 |
+
"single_word": false,
|
1049 |
+
"special": true
|
1050 |
+
},
|
1051 |
+
"128131": {
|
1052 |
+
"content": "<|reserved_special_token_123|>",
|
1053 |
+
"lstrip": false,
|
1054 |
+
"normalized": false,
|
1055 |
+
"rstrip": false,
|
1056 |
+
"single_word": false,
|
1057 |
+
"special": true
|
1058 |
+
},
|
1059 |
+
"128132": {
|
1060 |
+
"content": "<|reserved_special_token_124|>",
|
1061 |
+
"lstrip": false,
|
1062 |
+
"normalized": false,
|
1063 |
+
"rstrip": false,
|
1064 |
+
"single_word": false,
|
1065 |
+
"special": true
|
1066 |
+
},
|
1067 |
+
"128133": {
|
1068 |
+
"content": "<|reserved_special_token_125|>",
|
1069 |
+
"lstrip": false,
|
1070 |
+
"normalized": false,
|
1071 |
+
"rstrip": false,
|
1072 |
+
"single_word": false,
|
1073 |
+
"special": true
|
1074 |
+
},
|
1075 |
+
"128134": {
|
1076 |
+
"content": "<|reserved_special_token_126|>",
|
1077 |
+
"lstrip": false,
|
1078 |
+
"normalized": false,
|
1079 |
+
"rstrip": false,
|
1080 |
+
"single_word": false,
|
1081 |
+
"special": true
|
1082 |
+
},
|
1083 |
+
"128135": {
|
1084 |
+
"content": "<|reserved_special_token_127|>",
|
1085 |
+
"lstrip": false,
|
1086 |
+
"normalized": false,
|
1087 |
+
"rstrip": false,
|
1088 |
+
"single_word": false,
|
1089 |
+
"special": true
|
1090 |
+
},
|
1091 |
+
"128136": {
|
1092 |
+
"content": "<|reserved_special_token_128|>",
|
1093 |
+
"lstrip": false,
|
1094 |
+
"normalized": false,
|
1095 |
+
"rstrip": false,
|
1096 |
+
"single_word": false,
|
1097 |
+
"special": true
|
1098 |
+
},
|
1099 |
+
"128137": {
|
1100 |
+
"content": "<|reserved_special_token_129|>",
|
1101 |
+
"lstrip": false,
|
1102 |
+
"normalized": false,
|
1103 |
+
"rstrip": false,
|
1104 |
+
"single_word": false,
|
1105 |
+
"special": true
|
1106 |
+
},
|
1107 |
+
"128138": {
|
1108 |
+
"content": "<|reserved_special_token_130|>",
|
1109 |
+
"lstrip": false,
|
1110 |
+
"normalized": false,
|
1111 |
+
"rstrip": false,
|
1112 |
+
"single_word": false,
|
1113 |
+
"special": true
|
1114 |
+
},
|
1115 |
+
"128139": {
|
1116 |
+
"content": "<|reserved_special_token_131|>",
|
1117 |
+
"lstrip": false,
|
1118 |
+
"normalized": false,
|
1119 |
+
"rstrip": false,
|
1120 |
+
"single_word": false,
|
1121 |
+
"special": true
|
1122 |
+
},
|
1123 |
+
"128140": {
|
1124 |
+
"content": "<|reserved_special_token_132|>",
|
1125 |
+
"lstrip": false,
|
1126 |
+
"normalized": false,
|
1127 |
+
"rstrip": false,
|
1128 |
+
"single_word": false,
|
1129 |
+
"special": true
|
1130 |
+
},
|
1131 |
+
"128141": {
|
1132 |
+
"content": "<|reserved_special_token_133|>",
|
1133 |
+
"lstrip": false,
|
1134 |
+
"normalized": false,
|
1135 |
+
"rstrip": false,
|
1136 |
+
"single_word": false,
|
1137 |
+
"special": true
|
1138 |
+
},
|
1139 |
+
"128142": {
|
1140 |
+
"content": "<|reserved_special_token_134|>",
|
1141 |
+
"lstrip": false,
|
1142 |
+
"normalized": false,
|
1143 |
+
"rstrip": false,
|
1144 |
+
"single_word": false,
|
1145 |
+
"special": true
|
1146 |
+
},
|
1147 |
+
"128143": {
|
1148 |
+
"content": "<|reserved_special_token_135|>",
|
1149 |
+
"lstrip": false,
|
1150 |
+
"normalized": false,
|
1151 |
+
"rstrip": false,
|
1152 |
+
"single_word": false,
|
1153 |
+
"special": true
|
1154 |
+
},
|
1155 |
+
"128144": {
|
1156 |
+
"content": "<|reserved_special_token_136|>",
|
1157 |
+
"lstrip": false,
|
1158 |
+
"normalized": false,
|
1159 |
+
"rstrip": false,
|
1160 |
+
"single_word": false,
|
1161 |
+
"special": true
|
1162 |
+
},
|
1163 |
+
"128145": {
|
1164 |
+
"content": "<|reserved_special_token_137|>",
|
1165 |
+
"lstrip": false,
|
1166 |
+
"normalized": false,
|
1167 |
+
"rstrip": false,
|
1168 |
+
"single_word": false,
|
1169 |
+
"special": true
|
1170 |
+
},
|
1171 |
+
"128146": {
|
1172 |
+
"content": "<|reserved_special_token_138|>",
|
1173 |
+
"lstrip": false,
|
1174 |
+
"normalized": false,
|
1175 |
+
"rstrip": false,
|
1176 |
+
"single_word": false,
|
1177 |
+
"special": true
|
1178 |
+
},
|
1179 |
+
"128147": {
|
1180 |
+
"content": "<|reserved_special_token_139|>",
|
1181 |
+
"lstrip": false,
|
1182 |
+
"normalized": false,
|
1183 |
+
"rstrip": false,
|
1184 |
+
"single_word": false,
|
1185 |
+
"special": true
|
1186 |
+
},
|
1187 |
+
"128148": {
|
1188 |
+
"content": "<|reserved_special_token_140|>",
|
1189 |
+
"lstrip": false,
|
1190 |
+
"normalized": false,
|
1191 |
+
"rstrip": false,
|
1192 |
+
"single_word": false,
|
1193 |
+
"special": true
|
1194 |
+
},
|
1195 |
+
"128149": {
|
1196 |
+
"content": "<|reserved_special_token_141|>",
|
1197 |
+
"lstrip": false,
|
1198 |
+
"normalized": false,
|
1199 |
+
"rstrip": false,
|
1200 |
+
"single_word": false,
|
1201 |
+
"special": true
|
1202 |
+
},
|
1203 |
+
"128150": {
|
1204 |
+
"content": "<|reserved_special_token_142|>",
|
1205 |
+
"lstrip": false,
|
1206 |
+
"normalized": false,
|
1207 |
+
"rstrip": false,
|
1208 |
+
"single_word": false,
|
1209 |
+
"special": true
|
1210 |
+
},
|
1211 |
+
"128151": {
|
1212 |
+
"content": "<|reserved_special_token_143|>",
|
1213 |
+
"lstrip": false,
|
1214 |
+
"normalized": false,
|
1215 |
+
"rstrip": false,
|
1216 |
+
"single_word": false,
|
1217 |
+
"special": true
|
1218 |
+
},
|
1219 |
+
"128152": {
|
1220 |
+
"content": "<|reserved_special_token_144|>",
|
1221 |
+
"lstrip": false,
|
1222 |
+
"normalized": false,
|
1223 |
+
"rstrip": false,
|
1224 |
+
"single_word": false,
|
1225 |
+
"special": true
|
1226 |
+
},
|
1227 |
+
"128153": {
|
1228 |
+
"content": "<|reserved_special_token_145|>",
|
1229 |
+
"lstrip": false,
|
1230 |
+
"normalized": false,
|
1231 |
+
"rstrip": false,
|
1232 |
+
"single_word": false,
|
1233 |
+
"special": true
|
1234 |
+
},
|
1235 |
+
"128154": {
|
1236 |
+
"content": "<|reserved_special_token_146|>",
|
1237 |
+
"lstrip": false,
|
1238 |
+
"normalized": false,
|
1239 |
+
"rstrip": false,
|
1240 |
+
"single_word": false,
|
1241 |
+
"special": true
|
1242 |
+
},
|
1243 |
+
"128155": {
|
1244 |
+
"content": "<|reserved_special_token_147|>",
|
1245 |
+
"lstrip": false,
|
1246 |
+
"normalized": false,
|
1247 |
+
"rstrip": false,
|
1248 |
+
"single_word": false,
|
1249 |
+
"special": true
|
1250 |
+
},
|
1251 |
+
"128156": {
|
1252 |
+
"content": "<|reserved_special_token_148|>",
|
1253 |
+
"lstrip": false,
|
1254 |
+
"normalized": false,
|
1255 |
+
"rstrip": false,
|
1256 |
+
"single_word": false,
|
1257 |
+
"special": true
|
1258 |
+
},
|
1259 |
+
"128157": {
|
1260 |
+
"content": "<|reserved_special_token_149|>",
|
1261 |
+
"lstrip": false,
|
1262 |
+
"normalized": false,
|
1263 |
+
"rstrip": false,
|
1264 |
+
"single_word": false,
|
1265 |
+
"special": true
|
1266 |
+
},
|
1267 |
+
"128158": {
|
1268 |
+
"content": "<|reserved_special_token_150|>",
|
1269 |
+
"lstrip": false,
|
1270 |
+
"normalized": false,
|
1271 |
+
"rstrip": false,
|
1272 |
+
"single_word": false,
|
1273 |
+
"special": true
|
1274 |
+
},
|
1275 |
+
"128159": {
|
1276 |
+
"content": "<|reserved_special_token_151|>",
|
1277 |
+
"lstrip": false,
|
1278 |
+
"normalized": false,
|
1279 |
+
"rstrip": false,
|
1280 |
+
"single_word": false,
|
1281 |
+
"special": true
|
1282 |
+
},
|
1283 |
+
"128160": {
|
1284 |
+
"content": "<|reserved_special_token_152|>",
|
1285 |
+
"lstrip": false,
|
1286 |
+
"normalized": false,
|
1287 |
+
"rstrip": false,
|
1288 |
+
"single_word": false,
|
1289 |
+
"special": true
|
1290 |
+
},
|
1291 |
+
"128161": {
|
1292 |
+
"content": "<|reserved_special_token_153|>",
|
1293 |
+
"lstrip": false,
|
1294 |
+
"normalized": false,
|
1295 |
+
"rstrip": false,
|
1296 |
+
"single_word": false,
|
1297 |
+
"special": true
|
1298 |
+
},
|
1299 |
+
"128162": {
|
1300 |
+
"content": "<|reserved_special_token_154|>",
|
1301 |
+
"lstrip": false,
|
1302 |
+
"normalized": false,
|
1303 |
+
"rstrip": false,
|
1304 |
+
"single_word": false,
|
1305 |
+
"special": true
|
1306 |
+
},
|
1307 |
+
"128163": {
|
1308 |
+
"content": "<|reserved_special_token_155|>",
|
1309 |
+
"lstrip": false,
|
1310 |
+
"normalized": false,
|
1311 |
+
"rstrip": false,
|
1312 |
+
"single_word": false,
|
1313 |
+
"special": true
|
1314 |
+
},
|
1315 |
+
"128164": {
|
1316 |
+
"content": "<|reserved_special_token_156|>",
|
1317 |
+
"lstrip": false,
|
1318 |
+
"normalized": false,
|
1319 |
+
"rstrip": false,
|
1320 |
+
"single_word": false,
|
1321 |
+
"special": true
|
1322 |
+
},
|
1323 |
+
"128165": {
|
1324 |
+
"content": "<|reserved_special_token_157|>",
|
1325 |
+
"lstrip": false,
|
1326 |
+
"normalized": false,
|
1327 |
+
"rstrip": false,
|
1328 |
+
"single_word": false,
|
1329 |
+
"special": true
|
1330 |
+
},
|
1331 |
+
"128166": {
|
1332 |
+
"content": "<|reserved_special_token_158|>",
|
1333 |
+
"lstrip": false,
|
1334 |
+
"normalized": false,
|
1335 |
+
"rstrip": false,
|
1336 |
+
"single_word": false,
|
1337 |
+
"special": true
|
1338 |
+
},
|
1339 |
+
"128167": {
|
1340 |
+
"content": "<|reserved_special_token_159|>",
|
1341 |
+
"lstrip": false,
|
1342 |
+
"normalized": false,
|
1343 |
+
"rstrip": false,
|
1344 |
+
"single_word": false,
|
1345 |
+
"special": true
|
1346 |
+
},
|
1347 |
+
"128168": {
|
1348 |
+
"content": "<|reserved_special_token_160|>",
|
1349 |
+
"lstrip": false,
|
1350 |
+
"normalized": false,
|
1351 |
+
"rstrip": false,
|
1352 |
+
"single_word": false,
|
1353 |
+
"special": true
|
1354 |
+
},
|
1355 |
+
"128169": {
|
1356 |
+
"content": "<|reserved_special_token_161|>",
|
1357 |
+
"lstrip": false,
|
1358 |
+
"normalized": false,
|
1359 |
+
"rstrip": false,
|
1360 |
+
"single_word": false,
|
1361 |
+
"special": true
|
1362 |
+
},
|
1363 |
+
"128170": {
|
1364 |
+
"content": "<|reserved_special_token_162|>",
|
1365 |
+
"lstrip": false,
|
1366 |
+
"normalized": false,
|
1367 |
+
"rstrip": false,
|
1368 |
+
"single_word": false,
|
1369 |
+
"special": true
|
1370 |
+
},
|
1371 |
+
"128171": {
|
1372 |
+
"content": "<|reserved_special_token_163|>",
|
1373 |
+
"lstrip": false,
|
1374 |
+
"normalized": false,
|
1375 |
+
"rstrip": false,
|
1376 |
+
"single_word": false,
|
1377 |
+
"special": true
|
1378 |
+
},
|
1379 |
+
"128172": {
|
1380 |
+
"content": "<|reserved_special_token_164|>",
|
1381 |
+
"lstrip": false,
|
1382 |
+
"normalized": false,
|
1383 |
+
"rstrip": false,
|
1384 |
+
"single_word": false,
|
1385 |
+
"special": true
|
1386 |
+
},
|
1387 |
+
"128173": {
|
1388 |
+
"content": "<|reserved_special_token_165|>",
|
1389 |
+
"lstrip": false,
|
1390 |
+
"normalized": false,
|
1391 |
+
"rstrip": false,
|
1392 |
+
"single_word": false,
|
1393 |
+
"special": true
|
1394 |
+
},
|
1395 |
+
"128174": {
|
1396 |
+
"content": "<|reserved_special_token_166|>",
|
1397 |
+
"lstrip": false,
|
1398 |
+
"normalized": false,
|
1399 |
+
"rstrip": false,
|
1400 |
+
"single_word": false,
|
1401 |
+
"special": true
|
1402 |
+
},
|
1403 |
+
"128175": {
|
1404 |
+
"content": "<|reserved_special_token_167|>",
|
1405 |
+
"lstrip": false,
|
1406 |
+
"normalized": false,
|
1407 |
+
"rstrip": false,
|
1408 |
+
"single_word": false,
|
1409 |
+
"special": true
|
1410 |
+
},
|
1411 |
+
"128176": {
|
1412 |
+
"content": "<|reserved_special_token_168|>",
|
1413 |
+
"lstrip": false,
|
1414 |
+
"normalized": false,
|
1415 |
+
"rstrip": false,
|
1416 |
+
"single_word": false,
|
1417 |
+
"special": true
|
1418 |
+
},
|
1419 |
+
"128177": {
|
1420 |
+
"content": "<|reserved_special_token_169|>",
|
1421 |
+
"lstrip": false,
|
1422 |
+
"normalized": false,
|
1423 |
+
"rstrip": false,
|
1424 |
+
"single_word": false,
|
1425 |
+
"special": true
|
1426 |
+
},
|
1427 |
+
"128178": {
|
1428 |
+
"content": "<|reserved_special_token_170|>",
|
1429 |
+
"lstrip": false,
|
1430 |
+
"normalized": false,
|
1431 |
+
"rstrip": false,
|
1432 |
+
"single_word": false,
|
1433 |
+
"special": true
|
1434 |
+
},
|
1435 |
+
"128179": {
|
1436 |
+
"content": "<|reserved_special_token_171|>",
|
1437 |
+
"lstrip": false,
|
1438 |
+
"normalized": false,
|
1439 |
+
"rstrip": false,
|
1440 |
+
"single_word": false,
|
1441 |
+
"special": true
|
1442 |
+
},
|
1443 |
+
"128180": {
|
1444 |
+
"content": "<|reserved_special_token_172|>",
|
1445 |
+
"lstrip": false,
|
1446 |
+
"normalized": false,
|
1447 |
+
"rstrip": false,
|
1448 |
+
"single_word": false,
|
1449 |
+
"special": true
|
1450 |
+
},
|
1451 |
+
"128181": {
|
1452 |
+
"content": "<|reserved_special_token_173|>",
|
1453 |
+
"lstrip": false,
|
1454 |
+
"normalized": false,
|
1455 |
+
"rstrip": false,
|
1456 |
+
"single_word": false,
|
1457 |
+
"special": true
|
1458 |
+
},
|
1459 |
+
"128182": {
|
1460 |
+
"content": "<|reserved_special_token_174|>",
|
1461 |
+
"lstrip": false,
|
1462 |
+
"normalized": false,
|
1463 |
+
"rstrip": false,
|
1464 |
+
"single_word": false,
|
1465 |
+
"special": true
|
1466 |
+
},
|
1467 |
+
"128183": {
|
1468 |
+
"content": "<|reserved_special_token_175|>",
|
1469 |
+
"lstrip": false,
|
1470 |
+
"normalized": false,
|
1471 |
+
"rstrip": false,
|
1472 |
+
"single_word": false,
|
1473 |
+
"special": true
|
1474 |
+
},
|
1475 |
+
"128184": {
|
1476 |
+
"content": "<|reserved_special_token_176|>",
|
1477 |
+
"lstrip": false,
|
1478 |
+
"normalized": false,
|
1479 |
+
"rstrip": false,
|
1480 |
+
"single_word": false,
|
1481 |
+
"special": true
|
1482 |
+
},
|
1483 |
+
"128185": {
|
1484 |
+
"content": "<|reserved_special_token_177|>",
|
1485 |
+
"lstrip": false,
|
1486 |
+
"normalized": false,
|
1487 |
+
"rstrip": false,
|
1488 |
+
"single_word": false,
|
1489 |
+
"special": true
|
1490 |
+
},
|
1491 |
+
"128186": {
|
1492 |
+
"content": "<|reserved_special_token_178|>",
|
1493 |
+
"lstrip": false,
|
1494 |
+
"normalized": false,
|
1495 |
+
"rstrip": false,
|
1496 |
+
"single_word": false,
|
1497 |
+
"special": true
|
1498 |
+
},
|
1499 |
+
"128187": {
|
1500 |
+
"content": "<|reserved_special_token_179|>",
|
1501 |
+
"lstrip": false,
|
1502 |
+
"normalized": false,
|
1503 |
+
"rstrip": false,
|
1504 |
+
"single_word": false,
|
1505 |
+
"special": true
|
1506 |
+
},
|
1507 |
+
"128188": {
|
1508 |
+
"content": "<|reserved_special_token_180|>",
|
1509 |
+
"lstrip": false,
|
1510 |
+
"normalized": false,
|
1511 |
+
"rstrip": false,
|
1512 |
+
"single_word": false,
|
1513 |
+
"special": true
|
1514 |
+
},
|
1515 |
+
"128189": {
|
1516 |
+
"content": "<|reserved_special_token_181|>",
|
1517 |
+
"lstrip": false,
|
1518 |
+
"normalized": false,
|
1519 |
+
"rstrip": false,
|
1520 |
+
"single_word": false,
|
1521 |
+
"special": true
|
1522 |
+
},
|
1523 |
+
"128190": {
|
1524 |
+
"content": "<|reserved_special_token_182|>",
|
1525 |
+
"lstrip": false,
|
1526 |
+
"normalized": false,
|
1527 |
+
"rstrip": false,
|
1528 |
+
"single_word": false,
|
1529 |
+
"special": true
|
1530 |
+
},
|
1531 |
+
"128191": {
|
1532 |
+
"content": "<|reserved_special_token_183|>",
|
1533 |
+
"lstrip": false,
|
1534 |
+
"normalized": false,
|
1535 |
+
"rstrip": false,
|
1536 |
+
"single_word": false,
|
1537 |
+
"special": true
|
1538 |
+
},
|
1539 |
+
"128192": {
|
1540 |
+
"content": "<|reserved_special_token_184|>",
|
1541 |
+
"lstrip": false,
|
1542 |
+
"normalized": false,
|
1543 |
+
"rstrip": false,
|
1544 |
+
"single_word": false,
|
1545 |
+
"special": true
|
1546 |
+
},
|
1547 |
+
"128193": {
|
1548 |
+
"content": "<|reserved_special_token_185|>",
|
1549 |
+
"lstrip": false,
|
1550 |
+
"normalized": false,
|
1551 |
+
"rstrip": false,
|
1552 |
+
"single_word": false,
|
1553 |
+
"special": true
|
1554 |
+
},
|
1555 |
+
"128194": {
|
1556 |
+
"content": "<|reserved_special_token_186|>",
|
1557 |
+
"lstrip": false,
|
1558 |
+
"normalized": false,
|
1559 |
+
"rstrip": false,
|
1560 |
+
"single_word": false,
|
1561 |
+
"special": true
|
1562 |
+
},
|
1563 |
+
"128195": {
|
1564 |
+
"content": "<|reserved_special_token_187|>",
|
1565 |
+
"lstrip": false,
|
1566 |
+
"normalized": false,
|
1567 |
+
"rstrip": false,
|
1568 |
+
"single_word": false,
|
1569 |
+
"special": true
|
1570 |
+
},
|
1571 |
+
"128196": {
|
1572 |
+
"content": "<|reserved_special_token_188|>",
|
1573 |
+
"lstrip": false,
|
1574 |
+
"normalized": false,
|
1575 |
+
"rstrip": false,
|
1576 |
+
"single_word": false,
|
1577 |
+
"special": true
|
1578 |
+
},
|
1579 |
+
"128197": {
|
1580 |
+
"content": "<|reserved_special_token_189|>",
|
1581 |
+
"lstrip": false,
|
1582 |
+
"normalized": false,
|
1583 |
+
"rstrip": false,
|
1584 |
+
"single_word": false,
|
1585 |
+
"special": true
|
1586 |
+
},
|
1587 |
+
"128198": {
|
1588 |
+
"content": "<|reserved_special_token_190|>",
|
1589 |
+
"lstrip": false,
|
1590 |
+
"normalized": false,
|
1591 |
+
"rstrip": false,
|
1592 |
+
"single_word": false,
|
1593 |
+
"special": true
|
1594 |
+
},
|
1595 |
+
"128199": {
|
1596 |
+
"content": "<|reserved_special_token_191|>",
|
1597 |
+
"lstrip": false,
|
1598 |
+
"normalized": false,
|
1599 |
+
"rstrip": false,
|
1600 |
+
"single_word": false,
|
1601 |
+
"special": true
|
1602 |
+
},
|
1603 |
+
"128200": {
|
1604 |
+
"content": "<|reserved_special_token_192|>",
|
1605 |
+
"lstrip": false,
|
1606 |
+
"normalized": false,
|
1607 |
+
"rstrip": false,
|
1608 |
+
"single_word": false,
|
1609 |
+
"special": true
|
1610 |
+
},
|
1611 |
+
"128201": {
|
1612 |
+
"content": "<|reserved_special_token_193|>",
|
1613 |
+
"lstrip": false,
|
1614 |
+
"normalized": false,
|
1615 |
+
"rstrip": false,
|
1616 |
+
"single_word": false,
|
1617 |
+
"special": true
|
1618 |
+
},
|
1619 |
+
"128202": {
|
1620 |
+
"content": "<|reserved_special_token_194|>",
|
1621 |
+
"lstrip": false,
|
1622 |
+
"normalized": false,
|
1623 |
+
"rstrip": false,
|
1624 |
+
"single_word": false,
|
1625 |
+
"special": true
|
1626 |
+
},
|
1627 |
+
"128203": {
|
1628 |
+
"content": "<|reserved_special_token_195|>",
|
1629 |
+
"lstrip": false,
|
1630 |
+
"normalized": false,
|
1631 |
+
"rstrip": false,
|
1632 |
+
"single_word": false,
|
1633 |
+
"special": true
|
1634 |
+
},
|
1635 |
+
"128204": {
|
1636 |
+
"content": "<|reserved_special_token_196|>",
|
1637 |
+
"lstrip": false,
|
1638 |
+
"normalized": false,
|
1639 |
+
"rstrip": false,
|
1640 |
+
"single_word": false,
|
1641 |
+
"special": true
|
1642 |
+
},
|
1643 |
+
"128205": {
|
1644 |
+
"content": "<|reserved_special_token_197|>",
|
1645 |
+
"lstrip": false,
|
1646 |
+
"normalized": false,
|
1647 |
+
"rstrip": false,
|
1648 |
+
"single_word": false,
|
1649 |
+
"special": true
|
1650 |
+
},
|
1651 |
+
"128206": {
|
1652 |
+
"content": "<|reserved_special_token_198|>",
|
1653 |
+
"lstrip": false,
|
1654 |
+
"normalized": false,
|
1655 |
+
"rstrip": false,
|
1656 |
+
"single_word": false,
|
1657 |
+
"special": true
|
1658 |
+
},
|
1659 |
+
"128207": {
|
1660 |
+
"content": "<|reserved_special_token_199|>",
|
1661 |
+
"lstrip": false,
|
1662 |
+
"normalized": false,
|
1663 |
+
"rstrip": false,
|
1664 |
+
"single_word": false,
|
1665 |
+
"special": true
|
1666 |
+
},
|
1667 |
+
"128208": {
|
1668 |
+
"content": "<|reserved_special_token_200|>",
|
1669 |
+
"lstrip": false,
|
1670 |
+
"normalized": false,
|
1671 |
+
"rstrip": false,
|
1672 |
+
"single_word": false,
|
1673 |
+
"special": true
|
1674 |
+
},
|
1675 |
+
"128209": {
|
1676 |
+
"content": "<|reserved_special_token_201|>",
|
1677 |
+
"lstrip": false,
|
1678 |
+
"normalized": false,
|
1679 |
+
"rstrip": false,
|
1680 |
+
"single_word": false,
|
1681 |
+
"special": true
|
1682 |
+
},
|
1683 |
+
"128210": {
|
1684 |
+
"content": "<|reserved_special_token_202|>",
|
1685 |
+
"lstrip": false,
|
1686 |
+
"normalized": false,
|
1687 |
+
"rstrip": false,
|
1688 |
+
"single_word": false,
|
1689 |
+
"special": true
|
1690 |
+
},
|
1691 |
+
"128211": {
|
1692 |
+
"content": "<|reserved_special_token_203|>",
|
1693 |
+
"lstrip": false,
|
1694 |
+
"normalized": false,
|
1695 |
+
"rstrip": false,
|
1696 |
+
"single_word": false,
|
1697 |
+
"special": true
|
1698 |
+
},
|
1699 |
+
"128212": {
|
1700 |
+
"content": "<|reserved_special_token_204|>",
|
1701 |
+
"lstrip": false,
|
1702 |
+
"normalized": false,
|
1703 |
+
"rstrip": false,
|
1704 |
+
"single_word": false,
|
1705 |
+
"special": true
|
1706 |
+
},
|
1707 |
+
"128213": {
|
1708 |
+
"content": "<|reserved_special_token_205|>",
|
1709 |
+
"lstrip": false,
|
1710 |
+
"normalized": false,
|
1711 |
+
"rstrip": false,
|
1712 |
+
"single_word": false,
|
1713 |
+
"special": true
|
1714 |
+
},
|
1715 |
+
"128214": {
|
1716 |
+
"content": "<|reserved_special_token_206|>",
|
1717 |
+
"lstrip": false,
|
1718 |
+
"normalized": false,
|
1719 |
+
"rstrip": false,
|
1720 |
+
"single_word": false,
|
1721 |
+
"special": true
|
1722 |
+
},
|
1723 |
+
"128215": {
|
1724 |
+
"content": "<|reserved_special_token_207|>",
|
1725 |
+
"lstrip": false,
|
1726 |
+
"normalized": false,
|
1727 |
+
"rstrip": false,
|
1728 |
+
"single_word": false,
|
1729 |
+
"special": true
|
1730 |
+
},
|
1731 |
+
"128216": {
|
1732 |
+
"content": "<|reserved_special_token_208|>",
|
1733 |
+
"lstrip": false,
|
1734 |
+
"normalized": false,
|
1735 |
+
"rstrip": false,
|
1736 |
+
"single_word": false,
|
1737 |
+
"special": true
|
1738 |
+
},
|
1739 |
+
"128217": {
|
1740 |
+
"content": "<|reserved_special_token_209|>",
|
1741 |
+
"lstrip": false,
|
1742 |
+
"normalized": false,
|
1743 |
+
"rstrip": false,
|
1744 |
+
"single_word": false,
|
1745 |
+
"special": true
|
1746 |
+
},
|
1747 |
+
"128218": {
|
1748 |
+
"content": "<|reserved_special_token_210|>",
|
1749 |
+
"lstrip": false,
|
1750 |
+
"normalized": false,
|
1751 |
+
"rstrip": false,
|
1752 |
+
"single_word": false,
|
1753 |
+
"special": true
|
1754 |
+
},
|
1755 |
+
"128219": {
|
1756 |
+
"content": "<|reserved_special_token_211|>",
|
1757 |
+
"lstrip": false,
|
1758 |
+
"normalized": false,
|
1759 |
+
"rstrip": false,
|
1760 |
+
"single_word": false,
|
1761 |
+
"special": true
|
1762 |
+
},
|
1763 |
+
"128220": {
|
1764 |
+
"content": "<|reserved_special_token_212|>",
|
1765 |
+
"lstrip": false,
|
1766 |
+
"normalized": false,
|
1767 |
+
"rstrip": false,
|
1768 |
+
"single_word": false,
|
1769 |
+
"special": true
|
1770 |
+
},
|
1771 |
+
"128221": {
|
1772 |
+
"content": "<|reserved_special_token_213|>",
|
1773 |
+
"lstrip": false,
|
1774 |
+
"normalized": false,
|
1775 |
+
"rstrip": false,
|
1776 |
+
"single_word": false,
|
1777 |
+
"special": true
|
1778 |
+
},
|
1779 |
+
"128222": {
|
1780 |
+
"content": "<|reserved_special_token_214|>",
|
1781 |
+
"lstrip": false,
|
1782 |
+
"normalized": false,
|
1783 |
+
"rstrip": false,
|
1784 |
+
"single_word": false,
|
1785 |
+
"special": true
|
1786 |
+
},
|
1787 |
+
"128223": {
|
1788 |
+
"content": "<|reserved_special_token_215|>",
|
1789 |
+
"lstrip": false,
|
1790 |
+
"normalized": false,
|
1791 |
+
"rstrip": false,
|
1792 |
+
"single_word": false,
|
1793 |
+
"special": true
|
1794 |
+
},
|
1795 |
+
"128224": {
|
1796 |
+
"content": "<|reserved_special_token_216|>",
|
1797 |
+
"lstrip": false,
|
1798 |
+
"normalized": false,
|
1799 |
+
"rstrip": false,
|
1800 |
+
"single_word": false,
|
1801 |
+
"special": true
|
1802 |
+
},
|
1803 |
+
"128225": {
|
1804 |
+
"content": "<|reserved_special_token_217|>",
|
1805 |
+
"lstrip": false,
|
1806 |
+
"normalized": false,
|
1807 |
+
"rstrip": false,
|
1808 |
+
"single_word": false,
|
1809 |
+
"special": true
|
1810 |
+
},
|
1811 |
+
"128226": {
|
1812 |
+
"content": "<|reserved_special_token_218|>",
|
1813 |
+
"lstrip": false,
|
1814 |
+
"normalized": false,
|
1815 |
+
"rstrip": false,
|
1816 |
+
"single_word": false,
|
1817 |
+
"special": true
|
1818 |
+
},
|
1819 |
+
"128227": {
|
1820 |
+
"content": "<|reserved_special_token_219|>",
|
1821 |
+
"lstrip": false,
|
1822 |
+
"normalized": false,
|
1823 |
+
"rstrip": false,
|
1824 |
+
"single_word": false,
|
1825 |
+
"special": true
|
1826 |
+
},
|
1827 |
+
"128228": {
|
1828 |
+
"content": "<|reserved_special_token_220|>",
|
1829 |
+
"lstrip": false,
|
1830 |
+
"normalized": false,
|
1831 |
+
"rstrip": false,
|
1832 |
+
"single_word": false,
|
1833 |
+
"special": true
|
1834 |
+
},
|
1835 |
+
"128229": {
|
1836 |
+
"content": "<|reserved_special_token_221|>",
|
1837 |
+
"lstrip": false,
|
1838 |
+
"normalized": false,
|
1839 |
+
"rstrip": false,
|
1840 |
+
"single_word": false,
|
1841 |
+
"special": true
|
1842 |
+
},
|
1843 |
+
"128230": {
|
1844 |
+
"content": "<|reserved_special_token_222|>",
|
1845 |
+
"lstrip": false,
|
1846 |
+
"normalized": false,
|
1847 |
+
"rstrip": false,
|
1848 |
+
"single_word": false,
|
1849 |
+
"special": true
|
1850 |
+
},
|
1851 |
+
"128231": {
|
1852 |
+
"content": "<|reserved_special_token_223|>",
|
1853 |
+
"lstrip": false,
|
1854 |
+
"normalized": false,
|
1855 |
+
"rstrip": false,
|
1856 |
+
"single_word": false,
|
1857 |
+
"special": true
|
1858 |
+
},
|
1859 |
+
"128232": {
|
1860 |
+
"content": "<|reserved_special_token_224|>",
|
1861 |
+
"lstrip": false,
|
1862 |
+
"normalized": false,
|
1863 |
+
"rstrip": false,
|
1864 |
+
"single_word": false,
|
1865 |
+
"special": true
|
1866 |
+
},
|
1867 |
+
"128233": {
|
1868 |
+
"content": "<|reserved_special_token_225|>",
|
1869 |
+
"lstrip": false,
|
1870 |
+
"normalized": false,
|
1871 |
+
"rstrip": false,
|
1872 |
+
"single_word": false,
|
1873 |
+
"special": true
|
1874 |
+
},
|
1875 |
+
"128234": {
|
1876 |
+
"content": "<|reserved_special_token_226|>",
|
1877 |
+
"lstrip": false,
|
1878 |
+
"normalized": false,
|
1879 |
+
"rstrip": false,
|
1880 |
+
"single_word": false,
|
1881 |
+
"special": true
|
1882 |
+
},
|
1883 |
+
"128235": {
|
1884 |
+
"content": "<|reserved_special_token_227|>",
|
1885 |
+
"lstrip": false,
|
1886 |
+
"normalized": false,
|
1887 |
+
"rstrip": false,
|
1888 |
+
"single_word": false,
|
1889 |
+
"special": true
|
1890 |
+
},
|
1891 |
+
"128236": {
|
1892 |
+
"content": "<|reserved_special_token_228|>",
|
1893 |
+
"lstrip": false,
|
1894 |
+
"normalized": false,
|
1895 |
+
"rstrip": false,
|
1896 |
+
"single_word": false,
|
1897 |
+
"special": true
|
1898 |
+
},
|
1899 |
+
"128237": {
|
1900 |
+
"content": "<|reserved_special_token_229|>",
|
1901 |
+
"lstrip": false,
|
1902 |
+
"normalized": false,
|
1903 |
+
"rstrip": false,
|
1904 |
+
"single_word": false,
|
1905 |
+
"special": true
|
1906 |
+
},
|
1907 |
+
"128238": {
|
1908 |
+
"content": "<|reserved_special_token_230|>",
|
1909 |
+
"lstrip": false,
|
1910 |
+
"normalized": false,
|
1911 |
+
"rstrip": false,
|
1912 |
+
"single_word": false,
|
1913 |
+
"special": true
|
1914 |
+
},
|
1915 |
+
"128239": {
|
1916 |
+
"content": "<|reserved_special_token_231|>",
|
1917 |
+
"lstrip": false,
|
1918 |
+
"normalized": false,
|
1919 |
+
"rstrip": false,
|
1920 |
+
"single_word": false,
|
1921 |
+
"special": true
|
1922 |
+
},
|
1923 |
+
"128240": {
|
1924 |
+
"content": "<|reserved_special_token_232|>",
|
1925 |
+
"lstrip": false,
|
1926 |
+
"normalized": false,
|
1927 |
+
"rstrip": false,
|
1928 |
+
"single_word": false,
|
1929 |
+
"special": true
|
1930 |
+
},
|
1931 |
+
"128241": {
|
1932 |
+
"content": "<|reserved_special_token_233|>",
|
1933 |
+
"lstrip": false,
|
1934 |
+
"normalized": false,
|
1935 |
+
"rstrip": false,
|
1936 |
+
"single_word": false,
|
1937 |
+
"special": true
|
1938 |
+
},
|
1939 |
+
"128242": {
|
1940 |
+
"content": "<|reserved_special_token_234|>",
|
1941 |
+
"lstrip": false,
|
1942 |
+
"normalized": false,
|
1943 |
+
"rstrip": false,
|
1944 |
+
"single_word": false,
|
1945 |
+
"special": true
|
1946 |
+
},
|
1947 |
+
"128243": {
|
1948 |
+
"content": "<|reserved_special_token_235|>",
|
1949 |
+
"lstrip": false,
|
1950 |
+
"normalized": false,
|
1951 |
+
"rstrip": false,
|
1952 |
+
"single_word": false,
|
1953 |
+
"special": true
|
1954 |
+
},
|
1955 |
+
"128244": {
|
1956 |
+
"content": "<|reserved_special_token_236|>",
|
1957 |
+
"lstrip": false,
|
1958 |
+
"normalized": false,
|
1959 |
+
"rstrip": false,
|
1960 |
+
"single_word": false,
|
1961 |
+
"special": true
|
1962 |
+
},
|
1963 |
+
"128245": {
|
1964 |
+
"content": "<|reserved_special_token_237|>",
|
1965 |
+
"lstrip": false,
|
1966 |
+
"normalized": false,
|
1967 |
+
"rstrip": false,
|
1968 |
+
"single_word": false,
|
1969 |
+
"special": true
|
1970 |
+
},
|
1971 |
+
"128246": {
|
1972 |
+
"content": "<|reserved_special_token_238|>",
|
1973 |
+
"lstrip": false,
|
1974 |
+
"normalized": false,
|
1975 |
+
"rstrip": false,
|
1976 |
+
"single_word": false,
|
1977 |
+
"special": true
|
1978 |
+
},
|
1979 |
+
"128247": {
|
1980 |
+
"content": "<|reserved_special_token_239|>",
|
1981 |
+
"lstrip": false,
|
1982 |
+
"normalized": false,
|
1983 |
+
"rstrip": false,
|
1984 |
+
"single_word": false,
|
1985 |
+
"special": true
|
1986 |
+
},
|
1987 |
+
"128248": {
|
1988 |
+
"content": "<|reserved_special_token_240|>",
|
1989 |
+
"lstrip": false,
|
1990 |
+
"normalized": false,
|
1991 |
+
"rstrip": false,
|
1992 |
+
"single_word": false,
|
1993 |
+
"special": true
|
1994 |
+
},
|
1995 |
+
"128249": {
|
1996 |
+
"content": "<|reserved_special_token_241|>",
|
1997 |
+
"lstrip": false,
|
1998 |
+
"normalized": false,
|
1999 |
+
"rstrip": false,
|
2000 |
+
"single_word": false,
|
2001 |
+
"special": true
|
2002 |
+
},
|
2003 |
+
"128250": {
|
2004 |
+
"content": "<|reserved_special_token_242|>",
|
2005 |
+
"lstrip": false,
|
2006 |
+
"normalized": false,
|
2007 |
+
"rstrip": false,
|
2008 |
+
"single_word": false,
|
2009 |
+
"special": true
|
2010 |
+
},
|
2011 |
+
"128251": {
|
2012 |
+
"content": "<|reserved_special_token_243|>",
|
2013 |
+
"lstrip": false,
|
2014 |
+
"normalized": false,
|
2015 |
+
"rstrip": false,
|
2016 |
+
"single_word": false,
|
2017 |
+
"special": true
|
2018 |
+
},
|
2019 |
+
"128252": {
|
2020 |
+
"content": "<|reserved_special_token_244|>",
|
2021 |
+
"lstrip": false,
|
2022 |
+
"normalized": false,
|
2023 |
+
"rstrip": false,
|
2024 |
+
"single_word": false,
|
2025 |
+
"special": true
|
2026 |
+
},
|
2027 |
+
"128253": {
|
2028 |
+
"content": "<|reserved_special_token_245|>",
|
2029 |
+
"lstrip": false,
|
2030 |
+
"normalized": false,
|
2031 |
+
"rstrip": false,
|
2032 |
+
"single_word": false,
|
2033 |
+
"special": true
|
2034 |
+
},
|
2035 |
+
"128254": {
|
2036 |
+
"content": "<|reserved_special_token_246|>",
|
2037 |
+
"lstrip": false,
|
2038 |
+
"normalized": false,
|
2039 |
+
"rstrip": false,
|
2040 |
+
"single_word": false,
|
2041 |
+
"special": true
|
2042 |
+
},
|
2043 |
+
"128255": {
|
2044 |
+
"content": "<|reserved_special_token_247|>",
|
2045 |
+
"lstrip": false,
|
2046 |
+
"normalized": false,
|
2047 |
+
"rstrip": false,
|
2048 |
+
"single_word": false,
|
2049 |
+
"special": true
|
2050 |
+
}
|
2051 |
+
},
|
2052 |
+
"bos_token": "<|begin_of_text|>",
|
2053 |
+
"chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", 
\" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n",
|
2054 |
+
"clean_up_tokenization_spaces": true,
|
2055 |
+
"eos_token": "<|eot_id|>",
|
2056 |
+
"model_input_names": [
|
2057 |
+
"input_ids",
|
2058 |
+
"attention_mask"
|
2059 |
+
],
|
2060 |
+
"model_max_length": 131072,
|
2061 |
+
"pad_token": "<|finetune_right_pad_id|>",
|
2062 |
+
"padding_side": "right",
|
2063 |
+
"tokenizer_class": "PreTrainedTokenizerFast"
|
2064 |
+
}
|
crawl/crawl
CHANGED
@@ -124,9 +124,9 @@ def save_result(target_url):
|
|
124 |
|
125 |
# Choose the appropriate base path based on the operating system
|
126 |
if platform.system() == "Windows":
|
127 |
-
base_path = "E:\\datasets\\
|
128 |
-
|
129 |
-
base_path = "/home/kade/datasets/
|
130 |
|
131 |
save_dir = os.path.join(base_path, sanitized_title)
|
132 |
os.makedirs(save_dir, exist_ok=True)
|
|
|
124 |
|
125 |
# Choose the appropriate base path based on the operating system
|
126 |
if platform.system() == "Windows":
|
127 |
+
base_path = "E:\\datasets\\ragpile\\Saved Websites\\"
|
128 |
+
else:
|
129 |
+
base_path = "/home/kade/datasets/ragpile/Saved Websites"
|
130 |
|
131 |
save_dir = os.path.join(base_path, sanitized_title)
|
132 |
os.makedirs(save_dir, exist_ok=True)
|
crawl/crawl_wikipedia
CHANGED
@@ -126,9 +126,9 @@ def save_result(target_url):
|
|
126 |
|
127 |
# Choose the appropriate base path based on the operating system
|
128 |
if platform.system() == "Windows":
|
129 |
-
base_path = "E:\\
|
130 |
else:
|
131 |
-
base_path = "/home/kade/
|
132 |
|
133 |
save_dir = os.path.join(base_path, sanitized_title)
|
134 |
os.makedirs(save_dir, exist_ok=True)
|
|
|
126 |
|
127 |
# Choose the appropriate base path based on the operating system
|
128 |
if platform.system() == "Windows":
|
129 |
+
base_path = "E:\\ragpile\\Saved Websites\\"
|
130 |
else:
|
131 |
+
base_path = "/home/kade/datasets/ragpile/Saved Websites/"
|
132 |
|
133 |
save_dir = os.path.join(base_path, sanitized_title)
|
134 |
os.makedirs(save_dir, exist_ok=True)
|
joy
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
# -*- coding: utf-8 -*-
|
3 |
|
4 |
"""
|
5 |
-
JoyCaption Alpha
|
6 |
|
7 |
This module provides functionality for generating captions for images using a
|
8 |
combination of CLIP, LLM, and custom image adapters. It supports various
|
@@ -18,6 +18,7 @@ import os
|
|
18 |
import argparse
|
19 |
import re
|
20 |
import random
|
|
|
21 |
from pathlib import Path
|
22 |
from typing import List, Tuple, Dict
|
23 |
from PIL import Image
|
@@ -33,49 +34,56 @@ from transformers import (
|
|
33 |
)
|
34 |
from torch import nn
|
35 |
from e6db_reader import TagSetNormalizer, tag_category2id, tag_rank_to_freq
|
|
|
36 |
|
37 |
CLIP_PATH = "google/siglip-so400m-patch14-384"
|
38 |
MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
|
39 |
-
CHECKPOINT_PATH = Path(__file__).resolve().parent / "
|
40 |
CAPTION_TYPE_MAP = {
|
41 |
-
|
42 |
-
"Write a descriptive caption for this image in a formal tone."
|
|
|
|
|
43 |
],
|
44 |
-
|
45 |
-
"Write a descriptive caption for this image in a
|
46 |
-
"{word_count} words."
|
|
|
47 |
],
|
48 |
-
|
49 |
-
"Write a
|
|
|
|
|
50 |
],
|
51 |
-
|
52 |
-
"Write a
|
|
|
|
|
53 |
],
|
54 |
-
|
55 |
-
"Write a
|
56 |
-
"{word_count} words."
|
|
|
57 |
],
|
58 |
-
|
59 |
-
"Write a
|
|
|
|
|
60 |
],
|
61 |
-
|
62 |
-
"
|
|
|
|
|
63 |
],
|
64 |
-
|
65 |
-
"Write a
|
66 |
-
"{word_count} words."
|
|
|
67 |
],
|
68 |
-
|
69 |
-
"Write a
|
70 |
-
|
71 |
-
|
72 |
-
"Write a list of Booru tags for this image."
|
73 |
-
],
|
74 |
-
("rng-tags", "formal", False, True): [
|
75 |
-
"Write a list of Booru tags for this image within {word_count} words."
|
76 |
-
],
|
77 |
-
("rng-tags", "formal", True, False): [
|
78 |
-
"Write a {length} list of Booru tags for this image."
|
79 |
],
|
80 |
}
|
81 |
|
@@ -176,8 +184,9 @@ class ImageAdapter(nn.Module):
|
|
176 |
x = self.linear2(x)
|
177 |
|
178 |
other_tokens = self.other_tokens(
|
179 |
-
torch.tensor([0, 1], device=self.other_tokens.weight.device)
|
180 |
-
|
|
|
181 |
)
|
182 |
assert other_tokens.shape == (
|
183 |
x.shape[0],
|
@@ -200,6 +209,13 @@ class ImageAdapter(nn.Module):
|
|
200 |
).squeeze(0)
|
201 |
|
202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
class JoyCaptionModel:
|
204 |
"""
|
205 |
A class for generating captions for images using CLIP, LLM,
|
@@ -219,7 +235,7 @@ class JoyCaptionModel:
|
|
219 |
|
220 |
Methods:
|
221 |
load_models(): Load and initialize all required models.
|
222 |
-
process_image(input_image, caption_type,
|
223 |
Process an input image and generate a caption
|
224 |
based on specified parameters.
|
225 |
"""
|
@@ -234,18 +250,17 @@ class JoyCaptionModel:
|
|
234 |
"""
|
235 |
Load and initialize all required models (CLIP, LLM, image adapter).
|
236 |
"""
|
237 |
-
|
238 |
self.clip_model = AutoModel.from_pretrained(CLIP_PATH)
|
239 |
self.clip_model = self.clip_model.vision_model
|
240 |
|
241 |
if (CHECKPOINT_PATH / "clip_model.pt").exists():
|
242 |
-
|
243 |
checkpoint = torch.load(
|
244 |
CHECKPOINT_PATH / "clip_model.pt", map_location="cpu"
|
245 |
)
|
246 |
checkpoint = {
|
247 |
-
k.replace("_orig_mod.module.", ""): v
|
248 |
-
for k, v in checkpoint.items()
|
249 |
}
|
250 |
self.clip_model.load_state_dict(checkpoint)
|
251 |
del checkpoint
|
@@ -254,21 +269,19 @@ class JoyCaptionModel:
|
|
254 |
self.clip_model.requires_grad_(False)
|
255 |
self.clip_model.to("cuda")
|
256 |
|
257 |
-
|
258 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
259 |
-
|
260 |
)
|
261 |
assert isinstance(
|
262 |
self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
|
263 |
)
|
264 |
|
265 |
-
|
266 |
if (CHECKPOINT_PATH / "text_model").exists():
|
267 |
-
|
268 |
self.text_model = AutoModelForCausalLM.from_pretrained(
|
269 |
-
CHECKPOINT_PATH / "text_model",
|
270 |
-
device_map=0,
|
271 |
-
torch_dtype=torch.bfloat16
|
272 |
)
|
273 |
else:
|
274 |
self.text_model = AutoModelForCausalLM.from_pretrained(
|
@@ -277,7 +290,7 @@ class JoyCaptionModel:
|
|
277 |
|
278 |
self.text_model.eval()
|
279 |
|
280 |
-
|
281 |
self.image_adapter = ImageAdapter(
|
282 |
self.clip_model.config.hidden_size,
|
283 |
self.text_model.config.hidden_size,
|
@@ -287,10 +300,7 @@ class JoyCaptionModel:
|
|
287 |
False,
|
288 |
)
|
289 |
self.image_adapter.load_state_dict(
|
290 |
-
torch.load(
|
291 |
-
CHECKPOINT_PATH / "image_adapter.pt",
|
292 |
-
map_location="cpu"
|
293 |
-
)
|
294 |
)
|
295 |
self.image_adapter.eval()
|
296 |
self.image_adapter.to("cuda")
|
@@ -299,72 +309,120 @@ class JoyCaptionModel:
|
|
299 |
def process_image(
|
300 |
self,
|
301 |
input_image: Image.Image,
|
302 |
-
|
303 |
-
|
304 |
-
caption_length: str | int,
|
305 |
-
custom_prompt: str | None = None,
|
306 |
-
) -> str:
|
307 |
"""
|
308 |
-
Process an input image and generate a caption based on specified
|
309 |
-
|
|
|
|
|
|
|
310 |
"""
|
311 |
torch.cuda.empty_cache()
|
312 |
|
313 |
-
if custom_prompt is not None:
|
314 |
-
prompt_str = custom_prompt
|
315 |
-
else:
|
316 |
-
prompt_str = self._get_prompt_string(
|
317 |
-
caption_type, caption_tone, caption_length
|
318 |
-
)
|
319 |
-
print(f"Prompt: {prompt_str}")
|
320 |
-
|
321 |
pixel_values = self._preprocess_image(input_image)
|
322 |
-
prompt = self._tokenize_prompt(prompt_str)
|
323 |
|
324 |
embedded_images = self._embed_image(pixel_values)
|
325 |
inputs_embeds, input_ids, attention_mask = self._construct_inputs(
|
326 |
-
embedded_images,
|
327 |
)
|
328 |
|
329 |
-
generate_ids = self._generate_caption(inputs_embeds,
|
330 |
-
input_ids,
|
331 |
-
attention_mask)
|
332 |
caption = self._decode_caption(generate_ids, input_ids)
|
333 |
|
334 |
-
|
|
|
|
|
|
|
|
|
335 |
|
336 |
def generate_valid_caption(
|
337 |
self,
|
338 |
input_image: Image.Image,
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
|
|
|
|
|
|
343 |
) -> str:
|
344 |
"""
|
345 |
-
Generate a valid caption, retrying if
|
346 |
-
|
347 |
-
|
348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
349 |
"""
|
350 |
while True:
|
351 |
-
caption = self.process_image(
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
return caption
|
365 |
-
print("Generated caption is invalid. Retrying...")
|
366 |
|
367 |
-
|
|
|
368 |
length = None if caption_length == "any" else caption_length
|
369 |
|
370 |
if isinstance(length, str):
|
@@ -373,103 +431,128 @@ class JoyCaptionModel:
|
|
373 |
except ValueError:
|
374 |
pass
|
375 |
|
376 |
-
|
377 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
378 |
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
isinstance(length, str),
|
383 |
-
isinstance(length, int),
|
384 |
-
)
|
385 |
-
if prompt_key not in CAPTION_TYPE_MAP:
|
386 |
-
raise ValueError(f"Invalid caption type: {prompt_key}")
|
387 |
|
388 |
-
prompt_str = CAPTION_TYPE_MAP[
|
389 |
-
|
390 |
-
)
|
391 |
return prompt_str
|
392 |
|
393 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
394 |
image = input_image.resize((384, 384), Image.LANCZOS)
|
395 |
pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
|
396 |
pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
|
397 |
-
|
398 |
-
return pixel_values
|
399 |
|
400 |
-
def
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
add_special_tokens=False,
|
407 |
-
)
|
408 |
-
return prompt
|
409 |
|
410 |
-
|
|
|
|
|
411 |
with torch.amp.autocast_mode.autocast("cuda", enabled=True):
|
412 |
vision_outputs = self.clip_model(
|
413 |
pixel_values=pixel_values, output_hidden_states=True
|
414 |
)
|
415 |
-
|
416 |
-
|
417 |
-
embedded_images = embedded_images.to("cuda")
|
418 |
-
return embedded_images
|
419 |
-
|
420 |
-
def _construct_inputs(self, embedded_images, prompt):
|
421 |
-
prompt_embeds = self.text_model.model.embed_tokens(prompt.to("cuda"))
|
422 |
-
assert prompt_embeds.shape == (
|
423 |
-
1,
|
424 |
-
prompt.shape[1],
|
425 |
-
self.text_model.config.hidden_size,
|
426 |
-
), (
|
427 |
-
f"Prompt shape is {prompt_embeds.shape}, expected "
|
428 |
-
f"{(1, prompt.shape[1], self.text_model.config.hidden_size)}"
|
429 |
-
)
|
430 |
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
437 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
438 |
|
439 |
-
|
440 |
-
|
441 |
-
.unsqueeze(0)
|
442 |
-
.to(dtype=self.text_model.dtype)
|
443 |
)
|
444 |
|
445 |
-
|
|
|
446 |
[
|
447 |
-
|
448 |
-
embedded_images.to(dtype=
|
449 |
-
|
450 |
-
eot_embed.expand(embedded_images.shape[0], -1, -1),
|
451 |
],
|
452 |
dim=1,
|
453 |
-
)
|
454 |
|
455 |
input_ids = torch.cat(
|
456 |
[
|
457 |
-
|
458 |
-
|
459 |
-
),
|
460 |
-
torch.zeros(
|
461 |
-
(1, embedded_images.shape[1]), dtype=torch.long
|
462 |
-
),
|
463 |
-
prompt,
|
464 |
-
torch.tensor(
|
465 |
-
[[self.tokenizer.eos_token_id]], dtype=torch.long
|
466 |
-
),
|
467 |
],
|
468 |
dim=1,
|
469 |
).to("cuda")
|
|
|
470 |
attention_mask = torch.ones_like(input_ids)
|
471 |
|
472 |
-
return
|
473 |
|
474 |
def _generate_caption(self, inputs_embeds, input_ids, attention_mask):
|
475 |
generate_ids = self.text_model.generate(
|
@@ -477,6 +560,7 @@ class JoyCaptionModel:
|
|
477 |
inputs_embeds=inputs_embeds,
|
478 |
attention_mask=attention_mask,
|
479 |
max_new_tokens=300,
|
|
|
480 |
do_sample=True,
|
481 |
suppress_tokens=None,
|
482 |
repetition_penalty=1.2,
|
@@ -484,20 +568,73 @@ class JoyCaptionModel:
|
|
484 |
return generate_ids
|
485 |
|
486 |
def _decode_caption(self, generate_ids, input_ids):
|
487 |
-
generate_ids = generate_ids[:, input_ids.shape[1]:]
|
488 |
|
489 |
-
if
|
490 |
-
|
491 |
-
|
492 |
generate_ids = generate_ids[:, :-1]
|
493 |
|
494 |
caption = self.tokenizer.batch_decode(
|
495 |
-
generate_ids,
|
496 |
-
skip_special_tokens=False,
|
497 |
-
clean_up_tokenization_spaces=False
|
498 |
)[0]
|
499 |
return caption
|
500 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
501 |
|
502 |
def main():
|
503 |
"""
|
@@ -517,36 +654,21 @@ def main():
|
|
517 |
"--caption_type",
|
518 |
type=str,
|
519 |
default="descriptive",
|
520 |
-
choices=
|
521 |
help="Type of caption to generate.",
|
522 |
)
|
523 |
parser.add_argument(
|
524 |
-
"--
|
525 |
-
type=str,
|
526 |
-
default="formal",
|
527 |
-
choices=["formal", "informal"],
|
528 |
-
help="Tone of the caption.",
|
529 |
-
)
|
530 |
-
parser.add_argument(
|
531 |
-
"--caption_length",
|
532 |
-
type=str,
|
533 |
-
default="any",
|
534 |
-
help="Length of the caption."
|
535 |
)
|
536 |
parser.add_argument(
|
537 |
"--dont-strip-commas",
|
538 |
action="store_true",
|
539 |
-
help=(
|
540 |
-
"If set, commas will not be stripped from the generated captions."
|
541 |
-
),
|
542 |
)
|
543 |
parser.add_argument(
|
544 |
"--custom_prompt",
|
545 |
type=str,
|
546 |
-
help=(
|
547 |
-
"Custom prompt for the captioner. "
|
548 |
-
"Use with --caption_type custom."
|
549 |
-
),
|
550 |
)
|
551 |
parser.add_argument(
|
552 |
"--add-commas-to-sentence-ends",
|
@@ -564,6 +686,11 @@ def main():
|
|
564 |
"Optionally specify the number of tags to use."
|
565 |
),
|
566 |
)
|
|
|
|
|
|
|
|
|
|
|
567 |
parser.add_argument(
|
568 |
"--random-tags",
|
569 |
type=int,
|
@@ -572,81 +699,105 @@ def main():
|
|
572 |
"Only works if --feed-from-tags is enabled."
|
573 |
),
|
574 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
575 |
|
576 |
args = parser.parse_args()
|
577 |
|
578 |
-
|
579 |
-
if args.random_tags is not None and args.feed_from_tags is None:
|
580 |
-
parser.error(
|
581 |
-
"--random-tags can only be used when --feed-from-tags is enabled"
|
582 |
-
)
|
583 |
-
|
584 |
-
print("Loading e621 tag data")
|
585 |
-
tagset_normalizer = make_tagset_normalizer()
|
586 |
-
|
587 |
-
# Initialize and load models
|
588 |
-
joy_caption_model = JoyCaptionModel()
|
589 |
-
joy_caption_model.load_models()
|
590 |
-
|
591 |
-
# Validate custom prompt usage
|
592 |
-
if args.caption_type == "custom" and not args.custom_prompt:
|
593 |
-
parser.error(
|
594 |
-
"--custom_prompt is required when using --caption_type custom"
|
595 |
-
)
|
596 |
-
elif args.caption_type != "custom" and args.custom_prompt:
|
597 |
-
parser.error(
|
598 |
-
"--custom_prompt can only be used with --caption_type custom"
|
599 |
-
)
|
600 |
|
|
|
601 |
image_extensions = {".webp", ".png", ".jpeg", ".jpg", ".jxl"}
|
602 |
for image_path in Path(args.directory).rglob("*"):
|
603 |
if image_path.suffix.lower() in image_extensions:
|
604 |
caption_file = image_path.with_suffix(".caption")
|
605 |
-
|
606 |
# Skip if the caption file already exists
|
607 |
if caption_file.exists():
|
608 |
-
|
609 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
610 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
611 |
input_image = Image.open(image_path).convert("RGB")
|
612 |
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
elif args.feed_from_tags is not None:
|
618 |
-
custom_prompt = prompt_from_tags(
|
619 |
-
args, image_path, tagset_normalizer
|
620 |
-
)
|
621 |
|
622 |
-
|
|
|
|
|
|
|
623 |
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
args.caption_tone,
|
628 |
-
args.caption_length,
|
629 |
-
custom_prompt=custom_prompt,
|
630 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
631 |
|
632 |
-
|
633 |
-
|
634 |
-
|
635 |
-
|
636 |
|
637 |
-
|
638 |
-
|
639 |
-
|
640 |
|
641 |
-
|
642 |
-
|
643 |
|
644 |
-
|
645 |
|
646 |
-
|
647 |
-
|
648 |
-
|
649 |
-
|
650 |
|
651 |
|
652 |
RE_PARENS_SUFFIX = re.compile(r"_\([^)]+\)$")
|
@@ -723,11 +874,16 @@ TAG_CHARACTER = tag_category2id["character"]
|
|
723 |
TAG_ARTIST = tag_category2id["artist"]
|
724 |
TAG_COPYRIGHT = tag_category2id["copyright"]
|
725 |
TAG_META = tag_category2id["meta"]
|
726 |
-
TAG_FREQ_THRESH = 0
|
727 |
|
728 |
|
729 |
-
def prompt_from_tags(
|
730 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
731 |
"""
|
732 |
Generates a prompt from tags associated with the given image.
|
733 |
|
@@ -737,32 +893,35 @@ def prompt_from_tags(args, image_path: Path,
|
|
737 |
The path to the image file.
|
738 |
tagset_normalizer (TagSetNormalizer):
|
739 |
An instance to normalize the tag set.
|
740 |
-
|
741 |
-
Returns:
|
742 |
-
None
|
743 |
"""
|
|
|
744 |
tag_file = find_tag_file(image_path)
|
745 |
if tag_file is None:
|
746 |
-
|
|
|
747 |
|
748 |
with open(tag_file, "r", encoding="utf-8") as f:
|
749 |
tags = f.read().lower().split(",")
|
750 |
|
|
|
751 |
tag_id_to_cat_id = tagset_normalizer.tag_normalizer.tag_categories
|
752 |
encode = tagset_normalizer.tag_normalizer.encode
|
753 |
|
754 |
-
#
|
|
|
755 |
tag_by_category: Dict[int, List[Tuple[int, str, int]]] = {
|
756 |
-
cat: []
|
757 |
-
for cat in [TAG_ARTIST, TAG_CHARACTER, TAG_COPYRIGHT, TAG_SPECIES]
|
758 |
}
|
759 |
other_tags: List[Tuple[int, str, int]] = []
|
760 |
implied: set = set()
|
|
|
|
|
761 |
for tag in tags:
|
762 |
tag = tag.strip()
|
763 |
# Encode the tag into a numerical id
|
764 |
tag_id = encode(tag.replace(" ", "_"))
|
765 |
if tag_id is None:
|
|
|
766 |
other_tags.append((0, tag, 0))
|
767 |
implied.update(tagset_normalizer.implications_rej.get(0, ()))
|
768 |
continue
|
@@ -771,28 +930,29 @@ def prompt_from_tags(args, image_path: Path,
|
|
771 |
# Skip meta tags
|
772 |
if cat_id == TAG_META:
|
773 |
continue
|
|
|
774 |
implied.update(tagset_normalizer.implications.get(tag_id, ()))
|
775 |
# Get the frequency of the tag
|
776 |
freq = tag_rank_to_freq(tag_id)
|
777 |
-
if freq <
|
778 |
continue
|
779 |
-
|
780 |
-
|
781 |
-
)
|
782 |
|
|
|
783 |
other_tags = sorted(
|
784 |
-
(
|
785 |
for freq, tag, tag_id in other_tags
|
786 |
if tag_id not in implied
|
787 |
)
|
788 |
|
|
|
789 |
for cat_id, cat_list in tag_by_category.items():
|
790 |
tag_by_category[cat_id] = sorted(
|
791 |
-
(
|
792 |
-
for freq, tag, tag_id in cat_list
|
793 |
-
if tag_id not in implied
|
794 |
)
|
795 |
|
|
|
796 |
if args.random_tags is not None:
|
797 |
# Randomly select tags if --random-tags is specified
|
798 |
num_tags = min(args.random_tags, len(other_tags))
|
@@ -807,11 +967,10 @@ def prompt_from_tags(args, image_path: Path,
|
|
807 |
# Use specified number of tags if --feed-from-tags has a positive value
|
808 |
other_tags = other_tags[: args.feed_from_tags]
|
809 |
|
810 |
-
# Prepare sentence pieces
|
811 |
artist_tag = tag_by_category[TAG_ARTIST]
|
812 |
if artist_tag:
|
813 |
-
artist_list = [str(tp[1]).removeprefix(
|
814 |
-
for tp in artist_tag[:4]]
|
815 |
artist_txt = f"by {format_nl_list(artist_list)}"
|
816 |
else:
|
817 |
artist_txt = ""
|
@@ -826,15 +985,13 @@ def prompt_from_tags(args, image_path: Path,
|
|
826 |
species_tag = tag_by_category[TAG_SPECIES]
|
827 |
if species_tag:
|
828 |
species_txt = (
|
829 |
-
"of a "
|
830 |
-
if len(character_tag) <= 1 and len(species_tag) <= 1
|
831 |
-
else "of "
|
832 |
)
|
833 |
species_txt += format_nl_list([tp[1] for tp in species_tag[:4]])
|
834 |
else:
|
835 |
if character_tag:
|
836 |
species_txt = (
|
837 |
-
" a character" if len(character_tag) <= 1 else " characters"
|
838 |
)
|
839 |
else:
|
840 |
species_txt = ""
|
@@ -845,13 +1002,32 @@ def prompt_from_tags(args, image_path: Path,
|
|
845 |
copyright_txt = f"from {format_nl_list(tags)}"
|
846 |
else:
|
847 |
copyright_txt = ""
|
|
|
|
|
848 |
tag_string = ", ".join(tp[1] for tp in other_tags)
|
849 |
-
|
850 |
-
|
851 |
-
|
852 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
853 |
tag_string,
|
854 |
-
|
|
|
|
|
|
|
|
|
855 |
return custom_prompt
|
856 |
|
857 |
|
@@ -877,5 +1053,12 @@ def find_tag_file(image_path):
|
|
877 |
return None
|
878 |
|
879 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
880 |
if __name__ == "__main__":
|
881 |
main()
|
|
|
2 |
# -*- coding: utf-8 -*-
|
3 |
|
4 |
"""
|
5 |
+
JoyCaption Alpha Two
|
6 |
|
7 |
This module provides functionality for generating captions for images using a
|
8 |
combination of CLIP, LLM, and custom image adapters. It supports various
|
|
|
18 |
import argparse
|
19 |
import re
|
20 |
import random
|
21 |
+
import math
|
22 |
from pathlib import Path
|
23 |
from typing import List, Tuple, Dict
|
24 |
from PIL import Image
|
|
|
34 |
)
|
35 |
from torch import nn
|
36 |
from e6db_reader import TagSetNormalizer, tag_category2id, tag_rank_to_freq
|
37 |
+
import logging
|
38 |
|
39 |
CLIP_PATH = "google/siglip-so400m-patch14-384"
|
40 |
MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
|
41 |
+
CHECKPOINT_PATH = Path(__file__).resolve().parent / "cgrkzexw-599808"
|
42 |
CAPTION_TYPE_MAP = {
|
43 |
+
"descriptive": [
|
44 |
+
"Write a descriptive caption for this image in a formal tone.",
|
45 |
+
"Write a descriptive caption for this image in a formal tone within {word_count} words.",
|
46 |
+
"Write a {length} descriptive caption for this image in a formal tone.",
|
47 |
],
|
48 |
+
"descriptive (informal)": [
|
49 |
+
"Write a descriptive caption for this image in a casual tone.",
|
50 |
+
"Write a descriptive caption for this image in a casual tone within {word_count} words.",
|
51 |
+
"Write a {length} descriptive caption for this image in a casual tone.",
|
52 |
],
|
53 |
+
"training prompt": [
|
54 |
+
"Write a stable diffusion prompt for this image.",
|
55 |
+
"Write a stable diffusion prompt for this image within {word_count} words.",
|
56 |
+
"Write a {length} stable diffusion prompt for this image.",
|
57 |
],
|
58 |
+
"midjourney": [
|
59 |
+
"Write a MidJourney prompt for this image.",
|
60 |
+
"Write a MidJourney prompt for this image within {word_count} words.",
|
61 |
+
"Write a {length} MidJourney prompt for this image.",
|
62 |
],
|
63 |
+
"booru tag list": [
|
64 |
+
"Write a list of Booru tags for this image.",
|
65 |
+
"Write a list of Booru tags for this image within {word_count} words.",
|
66 |
+
"Write a {length} list of Booru tags for this image.",
|
67 |
],
|
68 |
+
"booru-like tag list": [
|
69 |
+
"Write a list of Booru-like tags for this image.",
|
70 |
+
"Write a list of Booru-like tags for this image within {word_count} words.",
|
71 |
+
"Write a {length} list of Booru-like tags for this image.",
|
72 |
],
|
73 |
+
"art critic": [
|
74 |
+
"Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.",
|
75 |
+
"Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it within {word_count} words.",
|
76 |
+
"Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it {length}.",
|
77 |
],
|
78 |
+
"product listing": [
|
79 |
+
"Write a caption for this image as though it were a product listing.",
|
80 |
+
"Write a caption for this image as though it were a product listing. Keep it under {word_count} words.",
|
81 |
+
"Write a {length} caption for this image as though it were a product listing.",
|
82 |
],
|
83 |
+
"social media post": [
|
84 |
+
"Write a caption for this image as if it were being used for a social media post.",
|
85 |
+
"Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.",
|
86 |
+
"Write a {length} caption for this image as if it were being used for a social media post.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
],
|
88 |
}
|
89 |
|
|
|
184 |
x = self.linear2(x)
|
185 |
|
186 |
other_tokens = self.other_tokens(
|
187 |
+
torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(
|
188 |
+
x.shape[0], -1
|
189 |
+
)
|
190 |
)
|
191 |
assert other_tokens.shape == (
|
192 |
x.shape[0],
|
|
|
209 |
).squeeze(0)
|
210 |
|
211 |
|
212 |
+
STOP_WORDS: set[str] = set(
|
213 |
+
"i'll if we'd can't you'd shouldn't i'd only doesn't further isn't didn't has more aren't during do than were he's too here you against could few for ought won't we until weren't i've they're same up she but are how here's their over can under mustn't while on by had and an each he'd he about she'd am was she'll where's did out or that's it they'd a let's shall what's the to don't when below no any some from is hadn't all they i'm must in before who's own where you've that very them this not because it's shan't wasn't you'll when's most off i at other hasn't nor been such again we'll down above will so should into she's once have these why's be we've as being why those then with after may you're would haven't both wouldn't there cannot they've couldn't how's between does we're through he'll of there's they'll might".split(
|
214 |
+
" "
|
215 |
+
)
|
216 |
+
)
|
217 |
+
|
218 |
+
|
219 |
class JoyCaptionModel:
|
220 |
"""
|
221 |
A class for generating captions for images using CLIP, LLM,
|
|
|
235 |
|
236 |
Methods:
|
237 |
load_models(): Load and initialize all required models.
|
238 |
+
process_image(input_image, caption_type, caption_length):
|
239 |
Process an input image and generate a caption
|
240 |
based on specified parameters.
|
241 |
"""
|
|
|
250 |
"""
|
251 |
Load and initialize all required models (CLIP, LLM, image adapter).
|
252 |
"""
|
253 |
+
logging.info("Loading CLIP")
|
254 |
self.clip_model = AutoModel.from_pretrained(CLIP_PATH)
|
255 |
self.clip_model = self.clip_model.vision_model
|
256 |
|
257 |
if (CHECKPOINT_PATH / "clip_model.pt").exists():
|
258 |
+
logging.info("Loading VLM's custom vision model")
|
259 |
checkpoint = torch.load(
|
260 |
CHECKPOINT_PATH / "clip_model.pt", map_location="cpu"
|
261 |
)
|
262 |
checkpoint = {
|
263 |
+
k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()
|
|
|
264 |
}
|
265 |
self.clip_model.load_state_dict(checkpoint)
|
266 |
del checkpoint
|
|
|
269 |
self.clip_model.requires_grad_(False)
|
270 |
self.clip_model.to("cuda")
|
271 |
|
272 |
+
logging.info("Loading tokenizer")
|
273 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
274 |
+
CHECKPOINT_PATH / "text_model", use_fast=True
|
275 |
)
|
276 |
assert isinstance(
|
277 |
self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
|
278 |
)
|
279 |
|
280 |
+
logging.info("Loading LLM")
|
281 |
if (CHECKPOINT_PATH / "text_model").exists():
|
282 |
+
logging.info("Loading VLM's custom text model")
|
283 |
self.text_model = AutoModelForCausalLM.from_pretrained(
|
284 |
+
CHECKPOINT_PATH / "text_model", device_map=0, torch_dtype=torch.bfloat16
|
|
|
|
|
285 |
)
|
286 |
else:
|
287 |
self.text_model = AutoModelForCausalLM.from_pretrained(
|
|
|
290 |
|
291 |
self.text_model.eval()
|
292 |
|
293 |
+
logging.info("Loading image adapter")
|
294 |
self.image_adapter = ImageAdapter(
|
295 |
self.clip_model.config.hidden_size,
|
296 |
self.text_model.config.hidden_size,
|
|
|
300 |
False,
|
301 |
)
|
302 |
self.image_adapter.load_state_dict(
|
303 |
+
torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu")
|
|
|
|
|
|
|
304 |
)
|
305 |
self.image_adapter.eval()
|
306 |
self.image_adapter.to("cuda")
|
|
|
309 |
def process_image(
|
310 |
self,
|
311 |
input_image: Image.Image,
|
312 |
+
prompt_str: str,
|
313 |
+
) -> Tuple[str, float]:
|
|
|
|
|
|
|
314 |
"""
|
315 |
+
Process an input image and generate a caption based on specified parameters.
|
316 |
+
Also calculates the entropy of the generated caption.
|
317 |
+
|
318 |
+
Returns:
|
319 |
+
Tuple[str, float]: The generated caption and its entropy.
|
320 |
"""
|
321 |
torch.cuda.empty_cache()
|
322 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
pixel_values = self._preprocess_image(input_image)
|
|
|
324 |
|
325 |
embedded_images = self._embed_image(pixel_values)
|
326 |
inputs_embeds, input_ids, attention_mask = self._construct_inputs(
|
327 |
+
embedded_images, prompt_str
|
328 |
)
|
329 |
|
330 |
+
generate_ids = self._generate_caption(inputs_embeds, input_ids, attention_mask)
|
|
|
|
|
331 |
caption = self._decode_caption(generate_ids, input_ids)
|
332 |
|
333 |
+
# Calculate entropy
|
334 |
+
token_ids = generate_ids[0].tolist()
|
335 |
+
entropy = self._calculate_entropy(token_ids)
|
336 |
+
|
337 |
+
return caption.strip(), entropy
|
338 |
|
339 |
def generate_valid_caption(
|
340 |
self,
|
341 |
input_image: Image.Image,
|
342 |
+
prompt: str,
|
343 |
+
*,
|
344 |
+
limited_words: Dict[str, int] = {"fluffy": 2},
|
345 |
+
min_sentence_count: int = 3,
|
346 |
+
max_word_repetitions: int = 5,
|
347 |
+
min_entropy: float = 1.75,
|
348 |
+
stop_words: set[str] = STOP_WORDS,
|
349 |
) -> str:
|
350 |
"""
|
351 |
+
Generate a valid caption, retrying if certain conditions are not met.
|
352 |
+
|
353 |
+
Args:
|
354 |
+
input_image (Image.Image): The input image to caption.
|
355 |
+
prompt (str | None): Prompt for caption generation.
|
356 |
+
limited_words (Dict[str, int]): Dictionary of words with their maximum allowed occurrences. Default is {"fluffy": 1}.
|
357 |
+
min_sentence_count (int): Minimum required number of sentences. Default is 3.
|
358 |
+
max_word_repetitions (int): Maximum allowed repetitions for words longer than 4 characters. Default is 15.
|
359 |
+
min_entropy (float): Minimum required entropy of the caption. Default is 2.3.
|
360 |
+
|
361 |
+
Returns:
|
362 |
+
str: A valid caption meeting all specified criteria.
|
363 |
+
|
364 |
+
The method retries caption generation if:
|
365 |
+
- The caption contains only special characters
|
366 |
+
- The caption does not end with a period, exclamation mark, or question mark
|
367 |
+
- Any word in limited_words appears more than its specified maximum times
|
368 |
+
- Any word longer than 4 characters is repeated more than max_word_repetitions times
|
369 |
+
- The caption contains fewer than min_sentence_count sentences
|
370 |
+
- The entropy of the caption is below min_entropy
|
371 |
"""
|
372 |
while True:
|
373 |
+
caption, entropy = self.process_image(input_image, prompt)
|
374 |
+
words = re.findall(r"\b\w+\b", caption.lower())
|
375 |
+
word_counts = {
|
376 |
+
word: words.count(word) for word in set(words) if word not in stop_words
|
377 |
+
}
|
378 |
+
sentence_count = len(re.findall(r"[.!?]", caption))
|
379 |
+
|
380 |
+
if not re.search(r"\w", caption):
|
381 |
+
logging.info(
|
382 |
+
f"Retrying: Caption contains only special characters.\nCaption: {caption!r}"
|
383 |
+
)
|
384 |
+
elif caption[-1] not in {".", "!", "?"}:
|
385 |
+
logging.info(
|
386 |
+
f"Retrying: Caption does not end with proper punctuation.\nCaption: {caption!r}"
|
387 |
+
)
|
388 |
+
elif any(
|
389 |
+
caption.lower().count(word) > max_count
|
390 |
+
for word, max_count in limited_words.items()
|
391 |
+
):
|
392 |
+
exceeded_words = [
|
393 |
+
f"{word} ({caption.lower().count(word)}/{max_count})"
|
394 |
+
for word, max_count in limited_words.items()
|
395 |
+
if caption.lower().count(word) > max_count
|
396 |
+
]
|
397 |
+
logging.info(
|
398 |
+
f"Retrying: Limited words exceeded: {', '.join(exceeded_words)}.\nCaption: {caption!r}"
|
399 |
+
)
|
400 |
+
elif any(
|
401 |
+
count > max_word_repetitions
|
402 |
+
for word, count in word_counts.items()
|
403 |
+
if len(word) > 4
|
404 |
+
):
|
405 |
+
repeated_words = [
|
406 |
+
word
|
407 |
+
for word, count in word_counts.items()
|
408 |
+
if count > max_word_repetitions and len(word) > 4
|
409 |
+
]
|
410 |
+
logging.info(
|
411 |
+
f"Retrying: Words repeated more than {max_word_repetitions} times: {', '.join(repeated_words)}.\nCaption: {caption!r}"
|
412 |
+
)
|
413 |
+
elif sentence_count < min_sentence_count:
|
414 |
+
logging.info(
|
415 |
+
f"Retrying: Only {sentence_count} sentences (min: {min_sentence_count}).\nCaption: {caption!r}"
|
416 |
+
)
|
417 |
+
elif entropy < min_entropy:
|
418 |
+
logging.info(
|
419 |
+
f"Retrying: Low entropy ({entropy:.2f} < {min_entropy}).\nCaption: {caption!r}"
|
420 |
+
)
|
421 |
+
else:
|
422 |
return caption
|
|
|
423 |
|
424 |
+
@staticmethod
|
425 |
+
def get_prompt_string(caption_type, caption_length):
|
426 |
length = None if caption_length == "any" else caption_length
|
427 |
|
428 |
if isinstance(length, str):
|
|
|
431 |
except ValueError:
|
432 |
pass
|
433 |
|
434 |
+
# Build prompt
|
435 |
+
if length is None:
|
436 |
+
map_idx = 0
|
437 |
+
elif isinstance(length, int):
|
438 |
+
map_idx = 1
|
439 |
+
elif isinstance(length, str):
|
440 |
+
map_idx = 2
|
441 |
+
else:
|
442 |
+
raise ValueError(f"Invalid caption length: {length}")
|
443 |
|
444 |
+
caption_type = caption_type.lower()
|
445 |
+
if caption_type not in CAPTION_TYPE_MAP:
|
446 |
+
raise ValueError(f"Invalid caption type: {caption_type}")
|
|
|
|
|
|
|
|
|
|
|
447 |
|
448 |
+
prompt_str = CAPTION_TYPE_MAP[caption_type][map_idx]
|
449 |
+
prompt_str = prompt_str.format(length=caption_length, word_count=caption_length)
|
|
|
450 |
return prompt_str
|
451 |
|
452 |
+
@staticmethod
|
453 |
+
def _preprocess_image(input_image: Image.Image) -> torch.Tensor:
|
454 |
+
"""
|
455 |
+
Preprocess the input image for the CLIP model.
|
456 |
+
|
457 |
+
Args:
|
458 |
+
input_image (Image.Image): The input PIL image.
|
459 |
+
|
460 |
+
Returns:
|
461 |
+
torch.Tensor: Preprocessed image tensor.
|
462 |
+
"""
|
463 |
image = input_image.resize((384, 384), Image.LANCZOS)
|
464 |
pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
|
465 |
pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
|
466 |
+
return pixel_values.to("cuda")
|
|
|
467 |
|
468 |
+
def _embed_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
|
469 |
+
"""
|
470 |
+
Embed the preprocessed image using CLIP and the image adapter.
|
471 |
+
|
472 |
+
Args:
|
473 |
+
pixel_values (torch.Tensor): Preprocessed image tensor.
|
|
|
|
|
|
|
474 |
|
475 |
+
Returns:
|
476 |
+
torch.Tensor: Embedded image tensor.
|
477 |
+
"""
|
478 |
with torch.amp.autocast_mode.autocast("cuda", enabled=True):
|
479 |
vision_outputs = self.clip_model(
|
480 |
pixel_values=pixel_values, output_hidden_states=True
|
481 |
)
|
482 |
+
embedded_images = self.image_adapter(vision_outputs.hidden_states)
|
483 |
+
return embedded_images.to("cuda")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
484 |
|
485 |
+
def _construct_inputs(
|
486 |
+
self, embedded_images: torch.Tensor, prompt_str: str
|
487 |
+
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
488 |
+
"""
|
489 |
+
Construct the inputs for the text model's generate method.
|
490 |
+
|
491 |
+
Args:
|
492 |
+
embedded_images (torch.Tensor): Embedded image tensor.
|
493 |
+
prompt_str (str): The prompt string for captioning.
|
494 |
+
|
495 |
+
Returns:
|
496 |
+
tuple: (input_embeds, input_ids, attention_mask)
|
497 |
+
"""
|
498 |
+
# Build the conversation
|
499 |
+
convo = [
|
500 |
+
{"role": "system", "content": "You are a helpful image captioner."},
|
501 |
+
{"role": "user", "content": prompt_str},
|
502 |
+
]
|
503 |
+
|
504 |
+
# Format and tokenize the conversation
|
505 |
+
convo_string = self.tokenizer.apply_chat_template(
|
506 |
+
convo, tokenize=False, add_generation_prompt=True
|
507 |
)
|
508 |
+
logging.debug(f"Convo:\n{convo_string}")
|
509 |
+
convo_tokens = self.tokenizer.encode(
|
510 |
+
convo_string,
|
511 |
+
return_tensors="pt",
|
512 |
+
add_special_tokens=False,
|
513 |
+
truncation=False,
|
514 |
+
)
|
515 |
+
prompt_tokens = self.tokenizer.encode(
|
516 |
+
prompt_str, return_tensors="pt", add_special_tokens=False, truncation=False
|
517 |
+
)
|
518 |
+
convo_tokens = convo_tokens.squeeze(0)
|
519 |
+
prompt_tokens = prompt_tokens.squeeze(0)
|
520 |
+
|
521 |
+
# Calculate where to inject the image
|
522 |
+
eot_id_indices = (
|
523 |
+
(convo_tokens == self.tokenizer.convert_tokens_to_ids("<|eot_id|>"))
|
524 |
+
.nonzero(as_tuple=True)[0]
|
525 |
+
.tolist()
|
526 |
+
)
|
527 |
+
preamble_len = eot_id_indices[1] - prompt_tokens.shape[0]
|
528 |
|
529 |
+
# Embed the tokens
|
530 |
+
convo_embeds = self.text_model.model.embed_tokens(
|
531 |
+
convo_tokens.unsqueeze(0).to("cuda")
|
|
|
532 |
)
|
533 |
|
534 |
+
# Construct the input
|
535 |
+
input_embeds = torch.cat(
|
536 |
[
|
537 |
+
convo_embeds[:, :preamble_len],
|
538 |
+
embedded_images.to(dtype=convo_embeds.dtype),
|
539 |
+
convo_embeds[:, preamble_len:],
|
|
|
540 |
],
|
541 |
dim=1,
|
542 |
+
).to("cuda")
|
543 |
|
544 |
input_ids = torch.cat(
|
545 |
[
|
546 |
+
convo_tokens[:preamble_len].unsqueeze(0),
|
547 |
+
torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
|
548 |
+
convo_tokens[preamble_len:].unsqueeze(0),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
549 |
],
|
550 |
dim=1,
|
551 |
).to("cuda")
|
552 |
+
|
553 |
attention_mask = torch.ones_like(input_ids)
|
554 |
|
555 |
+
return input_embeds, input_ids, attention_mask
|
556 |
|
557 |
def _generate_caption(self, inputs_embeds, input_ids, attention_mask):
|
558 |
generate_ids = self.text_model.generate(
|
|
|
560 |
inputs_embeds=inputs_embeds,
|
561 |
attention_mask=attention_mask,
|
562 |
max_new_tokens=300,
|
563 |
+
# max_length=4096,
|
564 |
do_sample=True,
|
565 |
suppress_tokens=None,
|
566 |
repetition_penalty=1.2,
|
|
|
568 |
return generate_ids
|
569 |
|
570 |
def _decode_caption(self, generate_ids, input_ids):
|
571 |
+
generate_ids = generate_ids[:, input_ids.shape[1] :]
|
572 |
|
573 |
+
if generate_ids[0][-1] == self.tokenizer.eos_token_id or generate_ids[0][
|
574 |
+
-1
|
575 |
+
] == self.tokenizer.convert_tokens_to_ids("<|eot_id|>"):
|
576 |
generate_ids = generate_ids[:, :-1]
|
577 |
|
578 |
caption = self.tokenizer.batch_decode(
|
579 |
+
generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
|
|
|
|
|
580 |
)[0]
|
581 |
return caption
|
582 |
|
583 |
+
def _calculate_entropy(self, token_ids: List[int]) -> float:
|
584 |
+
"""
|
585 |
+
Calculate the entropy of a sequence of token IDs.
|
586 |
+
|
587 |
+
Args:
|
588 |
+
token_ids (List[int]): List of token IDs.
|
589 |
+
|
590 |
+
Returns:
|
591 |
+
float: Entropy of the token sequence.
|
592 |
+
"""
|
593 |
+
token_counts = {}
|
594 |
+
total_tokens = len(token_ids)
|
595 |
+
|
596 |
+
for token_id in token_ids:
|
597 |
+
token_counts[token_id] = token_counts.get(token_id, 0) + 1
|
598 |
+
|
599 |
+
entropy = 0
|
600 |
+
for count in token_counts.values():
|
601 |
+
probability = count / total_tokens
|
602 |
+
entropy -= probability * math.log2(probability)
|
603 |
+
|
604 |
+
return entropy
|
605 |
+
|
606 |
+
|
607 |
+
class ColoredFormatter(logging.Formatter):
    """Logging formatter that wraps each record in an ANSI color escape
    chosen by the record's level name."""

    COLORS = {
        "DEBUG": "\033[36m",  # Cyan
        "INFO": "\033[32m",  # Green
        "WARNING": "\033[33m",  # Yellow
        "ERROR": "\033[31m",  # Red
        "CRITICAL": "\033[31;1m",  # Bright Red
    }
    RESET = "\033[0m"

    def format(self, record):
        # Delegate the real formatting to the base class, then colorize.
        text = logging.Formatter.format(self, record)
        color = self.COLORS.get(record.levelname, "")
        return f"{color}{text}{self.RESET}"
|
620 |
+
|
621 |
+
|
622 |
+
def setup_logging(verbosity):
    """
    Configure the root logger with a colored stream handler.

    Args:
        verbosity (int): 0 selects INFO; 1 or more selects DEBUG.
    """
    # Bug fix: the original only assigned log_level for verbosity 0 and 1,
    # so "-vv" (verbosity >= 2) raised UnboundLocalError. Clamp everything
    # at or above 1 to DEBUG instead.
    log_level = logging.DEBUG if verbosity >= 1 else logging.INFO

    handler = logging.StreamHandler()
    formatter = ColoredFormatter(
        fmt="%(asctime)s | %(levelname)-8s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    handler.setFormatter(formatter)

    logger = logging.getLogger()
    logger.setLevel(log_level)
    logger.addHandler(handler)
|
637 |
+
|
638 |
|
639 |
def main():
|
640 |
"""
|
|
|
654 |
"--caption_type",
|
655 |
type=str,
|
656 |
default="descriptive",
|
657 |
+
choices=CAPTION_TYPE_MAP.keys(),
|
658 |
help="Type of caption to generate.",
|
659 |
)
|
660 |
parser.add_argument(
|
661 |
+
"--caption_length", type=str, default="any", help="Length of the caption."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
662 |
)
|
663 |
parser.add_argument(
|
664 |
"--dont-strip-commas",
|
665 |
action="store_true",
|
666 |
+
help=("If set, commas will not be stripped from the generated captions."),
|
|
|
|
|
667 |
)
|
668 |
parser.add_argument(
|
669 |
"--custom_prompt",
|
670 |
type=str,
|
671 |
+
help=("Custom prompt for the captioner. " "Use with --caption_type custom."),
|
|
|
|
|
|
|
672 |
)
|
673 |
parser.add_argument(
|
674 |
"--add-commas-to-sentence-ends",
|
|
|
686 |
"Optionally specify the number of tags to use."
|
687 |
),
|
688 |
)
|
689 |
+
parser.add_argument(
|
690 |
+
"--artist-from-folder",
|
691 |
+
action="store_true",
|
692 |
+
help="Get the artist name from the parent folder",
|
693 |
+
)
|
694 |
parser.add_argument(
|
695 |
"--random-tags",
|
696 |
type=int,
|
|
|
699 |
"Only works if --feed-from-tags is enabled."
|
700 |
),
|
701 |
)
|
702 |
+
parser.add_argument(
|
703 |
+
"--dry-run",
|
704 |
+
action="store_true",
|
705 |
+
help="Run in dry-run mode without loading models or generating captions.",
|
706 |
+
)
|
707 |
+
parser.add_argument(
|
708 |
+
"-v",
|
709 |
+
"--verbose",
|
710 |
+
action="count",
|
711 |
+
default=0,
|
712 |
+
help="Increase output verbosity (can be repeated)",
|
713 |
+
)
|
714 |
|
715 |
args = parser.parse_args()
|
716 |
|
717 |
+
setup_logging(args.verbose)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
718 |
|
719 |
+
tasks = []
|
720 |
image_extensions = {".webp", ".png", ".jpeg", ".jpg", ".jxl"}
|
721 |
for image_path in Path(args.directory).rglob("*"):
|
722 |
if image_path.suffix.lower() in image_extensions:
|
723 |
caption_file = image_path.with_suffix(".caption")
|
|
|
724 |
# Skip if the caption file already exists
|
725 |
if caption_file.exists():
|
726 |
+
logging.info(f"Skipping {image_path}: Caption file already exists.")
|
727 |
continue
|
728 |
+
tasks.append((image_path, caption_file))
|
729 |
+
|
730 |
+
if not tasks:
|
731 |
+
logging.error("No input file found.")
|
732 |
+
return
|
733 |
+
|
734 |
+
# Validate random-tags usage
|
735 |
+
if args.random_tags is not None and args.feed_from_tags is None:
|
736 |
+
parser.error("--random-tags can only be used when --feed-from-tags is enabled")
|
737 |
+
|
738 |
+
if args.feed_from_tags is not None and args.artist_from_folder:
|
739 |
+
raise ValueError("feed-from-tags and artist-from-folder can't be used together")
|
740 |
+
|
741 |
+
if args.feed_from_tags is not None:
|
742 |
+
logging.info("Loading e621 tag data")
|
743 |
+
tagset_normalizer = make_tagset_normalizer()
|
744 |
|
745 |
+
# Initialize and load models only if not in dry-run mode
|
746 |
+
if not args.dry_run:
|
747 |
+
joy_caption_model = JoyCaptionModel()
|
748 |
+
joy_caption_model.load_models()
|
749 |
+
else:
|
750 |
+
logging.info("Running in dry-run mode. Models will not be loaded.")
|
751 |
+
|
752 |
+
for image_path, caption_file in tasks:
|
753 |
+
if not args.dry_run:
|
754 |
input_image = Image.open(image_path).convert("RGB")
|
755 |
|
756 |
+
# Use custom prompt if specified
|
757 |
+
prompt = args.custom_prompt or JoyCaptionModel.get_prompt_string(
|
758 |
+
args.caption_type, args.caption_length
|
759 |
+
)
|
|
|
|
|
|
|
|
|
760 |
|
761 |
+
if args.feed_from_tags is not None:
|
762 |
+
prompt = prompt_from_tags(args, image_path, tagset_normalizer, prompt)
|
763 |
+
elif args.artist_from_folder:
|
764 |
+
prompt = prompt_from_folder(prompt, image_path.resolve())
|
765 |
|
766 |
+
if args.dry_run:
|
767 |
+
logging.info(
|
768 |
+
f"Dry run: Skipping caption generation for {image_path} with prompt:\n\t{prompt}"
|
|
|
|
|
|
|
769 |
)
|
770 |
+
continue
|
771 |
+
else:
|
772 |
+
logging.info(f"Prompt for {image_path}:\n\t{prompt}")
|
773 |
+
|
774 |
+
caption = joy_caption_model.generate_valid_caption(input_image, prompt)
|
775 |
+
|
776 |
+
# Replace multiple spaces with a single space
|
777 |
+
caption = " ".join(caption.split())
|
778 |
+
# Replace multiple newlines with a single newline
|
779 |
+
caption = "\n".join(
|
780 |
+
line for line in (line.strip() for line in caption.split("\n")) if line
|
781 |
+
)
|
782 |
|
783 |
+
# Strip commas if the --dont-strip-commas flag is not set
|
784 |
+
if not args.dont_strip_commas:
|
785 |
+
# Existing comma stripping logic
|
786 |
+
caption = re.sub(r",\s*([^\d])", r" \1", caption)
|
787 |
|
788 |
+
# New feature: Add commas after periods if specified
|
789 |
+
if args.add_commas_to_sentence_ends:
|
790 |
+
caption = re.sub(r"(\.)(\s+)([A-Z])", r"\1,\2\3", caption)
|
791 |
|
792 |
+
# Remove all newline characters
|
793 |
+
caption = caption.replace("\n", " ")
|
794 |
|
795 |
+
logging.info(f"Caption for {image_path}:\n\t{caption}\n\n")
|
796 |
|
797 |
+
# Save the caption to a .caption file
|
798 |
+
with open(caption_file, "w", encoding="utf-8") as f:
|
799 |
+
f.write(caption)
|
800 |
+
logging.info(f"Caption saved to {caption_file}")
|
801 |
|
802 |
|
803 |
RE_PARENS_SUFFIX = re.compile(r"_\([^)]+\)$")
|
|
|
874 |
TAG_ARTIST = tag_category2id["artist"]
|
875 |
TAG_COPYRIGHT = tag_category2id["copyright"]
|
876 |
TAG_META = tag_category2id["meta"]
|
|
|
877 |
|
878 |
|
879 |
+
def prompt_from_tags(
|
880 |
+
args,
|
881 |
+
image_path: Path,
|
882 |
+
tagset_normalizer: TagSetNormalizer,
|
883 |
+
base_prompt: str = "Write a descriptive caption for this image in a formal tone.",
|
884 |
+
tag_freq_threshold: int = 0,
|
885 |
+
tag_string_prefix: str = "Use these tags to construct your caption:",
|
886 |
+
):
|
887 |
"""
|
888 |
Generates a prompt from tags associated with the given image.
|
889 |
|
|
|
893 |
The path to the image file.
|
894 |
tagset_normalizer (TagSetNormalizer):
|
895 |
An instance to normalize the tag set.
|
|
|
|
|
|
|
896 |
"""
|
897 |
+
# Find and read the corresponding tag file
|
898 |
tag_file = find_tag_file(image_path)
|
899 |
if tag_file is None:
|
900 |
+
logging.warning(f"No tag file found for {image_path}")
|
901 |
+
return base_prompt
|
902 |
|
903 |
with open(tag_file, "r", encoding="utf-8") as f:
|
904 |
tags = f.read().lower().split(",")
|
905 |
|
906 |
+
# Get helper functions from the tagset_normalizer
|
907 |
tag_id_to_cat_id = tagset_normalizer.tag_normalizer.tag_categories
|
908 |
encode = tagset_normalizer.tag_normalizer.encode
|
909 |
|
910 |
+
# Initialize dictionaries and lists to store categorized tags
|
911 |
+
# These lists will contain tuples (freq, tag, tag_id)
|
912 |
tag_by_category: Dict[int, List[Tuple[int, str, int]]] = {
|
913 |
+
cat: [] for cat in [TAG_ARTIST, TAG_CHARACTER, TAG_COPYRIGHT, TAG_SPECIES]
|
|
|
914 |
}
|
915 |
other_tags: List[Tuple[int, str, int]] = []
|
916 |
implied: set = set()
|
917 |
+
|
918 |
+
# Process each tag
|
919 |
for tag in tags:
|
920 |
tag = tag.strip()
|
921 |
# Encode the tag into a numerical id
|
922 |
tag_id = encode(tag.replace(" ", "_"))
|
923 |
if tag_id is None:
|
924 |
+
# If tag is not recognized, add it to other_tags
|
925 |
other_tags.append((0, tag, 0))
|
926 |
implied.update(tagset_normalizer.implications_rej.get(0, ()))
|
927 |
continue
|
|
|
930 |
# Skip meta tags
|
931 |
if cat_id == TAG_META:
|
932 |
continue
|
933 |
+
# Update implied tags
|
934 |
implied.update(tagset_normalizer.implications.get(tag_id, ()))
|
935 |
# Get the frequency of the tag
|
936 |
freq = tag_rank_to_freq(tag_id)
|
937 |
+
if freq < tag_freq_threshold:
|
938 |
continue
|
939 |
+
# Add the tag to its category, or other_tags
|
940 |
+
tag_by_category.get(cat_id, other_tags).append((int(freq), tag, tag_id))
|
|
|
941 |
|
942 |
+
# Sort other_tags by frequency (descending) and filter out implied tags
|
943 |
other_tags = sorted(
|
944 |
+
(-freq, tag, tag_id)
|
945 |
for freq, tag, tag_id in other_tags
|
946 |
if tag_id not in implied
|
947 |
)
|
948 |
|
949 |
+
# Sort tags within each category, prefering non implied tags
|
950 |
for cat_id, cat_list in tag_by_category.items():
|
951 |
tag_by_category[cat_id] = sorted(
|
952 |
+
((tag_id in implied, -freq), tag, tag_id) for freq, tag, tag_id in cat_list
|
|
|
|
|
953 |
)
|
954 |
|
955 |
+
# Handle random tag selection or tag limit if specified
|
956 |
if args.random_tags is not None:
|
957 |
# Randomly select tags if --random-tags is specified
|
958 |
num_tags = min(args.random_tags, len(other_tags))
|
|
|
967 |
# Use specified number of tags if --feed-from-tags has a positive value
|
968 |
other_tags = other_tags[: args.feed_from_tags]
|
969 |
|
970 |
+
# Prepare sentence pieces for each category
|
971 |
artist_tag = tag_by_category[TAG_ARTIST]
|
972 |
if artist_tag:
|
973 |
+
artist_list = [str(tp[1]).removeprefix("by ") for tp in artist_tag[:4]]
|
|
|
974 |
artist_txt = f"by {format_nl_list(artist_list)}"
|
975 |
else:
|
976 |
artist_txt = ""
|
|
|
985 |
species_tag = tag_by_category[TAG_SPECIES]
|
986 |
if species_tag:
|
987 |
species_txt = (
|
988 |
+
"of a " if len(character_tag) <= 1 and len(species_tag) <= 1 else "of "
|
|
|
|
|
989 |
)
|
990 |
species_txt += format_nl_list([tp[1] for tp in species_tag[:4]])
|
991 |
else:
|
992 |
if character_tag:
|
993 |
species_txt = (
|
994 |
+
"of a character" if len(character_tag) <= 1 else "of characters"
|
995 |
)
|
996 |
else:
|
997 |
species_txt = ""
|
|
|
1002 |
copyright_txt = f"from {format_nl_list(tags)}"
|
1003 |
else:
|
1004 |
copyright_txt = ""
|
1005 |
+
|
1006 |
+
# Prepare the remaining tags as a string
|
1007 |
tag_string = ", ".join(tp[1] for tp in other_tags)
|
1008 |
+
|
1009 |
+
# Extract the prefix and suffix around the word "image" from the base prompt
|
1010 |
+
image_pos = base_prompt.find("image")
|
1011 |
+
if image_pos < 0:
|
1012 |
+
raise ValueError("Base prompt must contain the word 'image'")
|
1013 |
+
image_pos += len("image")
|
1014 |
+
base_prompt_prefix = base_prompt[:image_pos].rstrip()
|
1015 |
+
base_prompt_suffix = base_prompt[image_pos:].lstrip()
|
1016 |
+
|
1017 |
+
pieces = [
|
1018 |
+
base_prompt_prefix,
|
1019 |
+
artist_txt,
|
1020 |
+
species_txt,
|
1021 |
+
character_txt,
|
1022 |
+
copyright_txt,
|
1023 |
+
base_prompt_suffix,
|
1024 |
+
tag_string_prefix,
|
1025 |
tag_string,
|
1026 |
+
".",
|
1027 |
+
]
|
1028 |
+
logging.debug("Prompt pieces: %r", pieces)
|
1029 |
+
custom_prompt = " ".join(p for p in pieces if p)
|
1030 |
+
custom_prompt = custom_prompt.replace(" .", ".").replace(" ,", ",")
|
1031 |
return custom_prompt
|
1032 |
|
1033 |
|
|
|
1053 |
return None
|
1054 |
|
1055 |
|
1056 |
+
def prompt_from_folder(prompt, path):
    """Inject the parent-folder name into the prompt as an artist credit.

    Underscores, hyphens and dots in the folder name become spaces and the
    result is title-cased (e.g. 'john_doe' -> 'John Doe'); the first
    occurrence of the word 'image' in the prompt gains a 'by <artist>'
    suffix.
    """
    folder = path.parent.name
    for separator in ("_", "-", "."):
        folder = folder.replace(separator, " ")
    artist = folder.title()
    return prompt.replace("image", f"image by {artist}")
|
1061 |
+
|
1062 |
+
|
1063 |
if __name__ == "__main__":
|
1064 |
main()
|
keyframe
CHANGED
File without changes
|
ogg2wav
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/zsh

# ogg2wav — recursively convert every .ogg file under a directory to .wav
# using ffmpeg. Usage: ogg2wav [target_dir]  (defaults to the current dir).

# Convert a single ogg file to a wav file alongside it.
convert_ogg_to_wav() {
    local input_file="$1"
    # ${name:r} is the zsh "root" modifier: path with its extension removed.
    local output_file="${input_file:r}.wav"
    ffmpeg -i "$input_file" "$output_file"
    echo "Converted: $input_file -> $output_file"
}

# Set the target directory (default: current directory).
if [[ $# -eq 0 ]]; then
    target_dir="."
else
    target_dir="$1"
fi

# Check if the target directory exists.
if [[ ! -d "$target_dir" ]]; then
    echo "Error: Directory '$target_dir' does not exist." >&2
    exit 1
fi

# Bug fix: the original used ogg_files=($(find ...)), which word-splits and
# breaks on filenames containing spaces or newlines. A recursive zsh glob
# keeps each path intact; (N.) = nullglob + regular files only.
ogg_files=("$target_dir"/**/*.ogg(N.))

# Check if any .ogg files were found.
if [[ ${#ogg_files[@]} -eq 0 ]]; then
    echo "No .ogg files found in '$target_dir' or its subdirectories."
    exit 0
fi

# Convert each .ogg file to .wav.
for file in "${ogg_files[@]}"; do
    convert_ogg_to_wav "$file"
done

echo "Conversion complete."
|
paper-qa.code-workspace
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"folders": [
|
3 |
-
{
|
4 |
-
"path": "."
|
5 |
-
},
|
6 |
-
{
|
7 |
-
"path": "../miniconda3/lib/python3.12/site-packages/paperqa"
|
8 |
-
}
|
9 |
-
],
|
10 |
-
"settings": {}
|
11 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
remove_extra_whitespace
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
"""
|
5 |
+
This script removes all extra spaces (more than one) and new line characters (truncating to one single character)
|
6 |
+
from all *.caption and *.txt files in a target directory recursively. If no target directory is provided as an
|
7 |
+
argument, it processes the current directory.
|
8 |
+
|
9 |
+
Usage:
|
10 |
+
python script_name.py [target_directory]
|
11 |
+
|
12 |
+
Args:
|
13 |
+
target_directory (str, optional): The path to the target directory. If not provided, the current directory is used.
|
14 |
+
"""
|
15 |
+
|
16 |
+
import os
|
17 |
+
import sys
|
18 |
+
import glob
|
19 |
+
|
20 |
+
def remove_extra_spaces_and_newlines(file_path):
    """
    Collapse runs of whitespace in a text file, in place.

    Runs of spaces/tabs within a line become a single space, and runs of
    newlines become a single newline — as the module docstring promises.

    Bug fix: the original joined on ALL whitespace first
    (' '.join(content.split())), which silently turned every newline into
    a space, so the subsequent newline-collapsing step never did anything.

    Args:
        file_path (str): The path to the file to be processed.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Collapse horizontal whitespace within each line.
    collapsed_lines = (' '.join(line.split()) for line in content.split('\n'))

    # Drop now-empty lines so consecutive newlines shrink to a single one.
    content = '\n'.join(line for line in collapsed_lines if line)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)
|
39 |
+
|
40 |
+
def process_files_in_directory(directory):
    """
    Recursively clean every *.caption and *.txt file under a directory,
    removing extra spaces and newline characters from each file.

    Args:
        directory (str): The path to the directory to be processed.
    """
    for pattern in ('*.caption', '*.txt'):
        search = os.path.join(directory, '**', pattern)
        for file_path in glob.glob(search, recursive=True):
            remove_extra_spaces_and_newlines(file_path)
|
52 |
+
|
53 |
+
if __name__ == "__main__":
    # Use the first CLI argument as the target, else the current directory.
    target_directory = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()
    process_files_in_directory(target_directory)
|
60 |
+
|
remove_tag
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import pathlib
|
5 |
+
import re
|
6 |
+
|
7 |
+
def remove_word_from_file(file_path, word):
    """Strip every occurrence of `word` from a file, in place.

    A comma and a single whitespace character immediately following the
    word are removed along with it, keeping tag lists tidy.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        text = fh.read()

    # \b<word>,?\s? — the word at a boundary, an optional trailing comma,
    # then an optional single whitespace character.
    word_re = re.compile(r'\b' + re.escape(word) + r',?\s?')
    cleaned = word_re.sub('', text)

    with open(file_path, 'w', encoding='utf-8') as fh:
        fh.write(cleaned)
|
17 |
+
|
18 |
+
def remove_word_from_directory(directory, word):
    """Apply remove_word_from_file to every *.txt under `directory`,
    searching recursively."""
    root = pathlib.Path(directory)
    for txt_file in root.rglob('*.txt'):
        remove_word_from_file(txt_file, word)
|
22 |
+
|
23 |
+
if __name__ == "__main__":
    import sys

    # Expect exactly <directory> and <word> as arguments.
    if len(sys.argv) != 3:
        print("Usage: python script.py <directory> <word>")
        sys.exit(1)

    target_directory, target_word = sys.argv[1], sys.argv[2]
    remove_word_from_directory(target_directory, target_word)
|
33 |
+
|
stats
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
import os
|
4 |
+
import sys
|
5 |
+
from collections import Counter
|
6 |
+
|
7 |
+
# ANSI color codes
|
8 |
+
|
9 |
+
RED = "\033[91m"
|
10 |
+
GREEN = "\033[92m"
|
11 |
+
ORANGE = "\033[93m"
|
12 |
+
BLUE = "\033[94m"
|
13 |
+
MAGENTA = "\033[95m"
|
14 |
+
CYAN = "\033[96m"
|
15 |
+
RESET = "\033[0m"
|
16 |
+
|
17 |
+
EXT2COLOR = {
|
18 |
+
"jxl": CYAN,
|
19 |
+
"png": MAGENTA,
|
20 |
+
"jpg": RED,
|
21 |
+
"jpeg": RED,
|
22 |
+
"webp": MAGENTA,
|
23 |
+
"caption": BLUE,
|
24 |
+
"txt": BLUE,
|
25 |
+
}
|
26 |
+
EXT2ORDER = {ext: i for i, ext in enumerate(EXT2COLOR.keys())}
|
27 |
+
SORT_KEYS = ["name", "count", "image", "text", *EXT2COLOR.keys()]
|
28 |
+
|
29 |
+
TEXT_FORMATS = {"txt", "caption"}
|
30 |
+
IMAGE_FORMATS = EXT2COLOR.keys() - TEXT_FORMATS
|
31 |
+
|
32 |
+
|
33 |
+
def count_files(directory):
    """Tally files under `directory` (recursively) by extension.

    The leading dot is stripped from extensions; files whose extension is
    in IMAGE_FORMATS / TEXT_FORMATS also bump the aggregate 'image' /
    'text' counters.
    """
    file_counts = Counter()
    for _root, _dirs, names in os.walk(directory):
        for name in names:
            ext = os.path.splitext(name)[1]
            if len(ext) > 1:
                ext = ext[1:]  # drop the leading '.'
            file_counts[ext] += 1
            if ext in IMAGE_FORMATS:
                file_counts["image"] += 1
            elif ext in TEXT_FORMATS:
                file_counts["text"] += 1

    return file_counts
|
47 |
+
|
48 |
+
|
49 |
+
def main():
    """Print per-directory file-type statistics for the current directory.

    Optional argv[1] selects the sort key (one of SORT_KEYS); append "_r"
    to reverse the sort order.
    """
    sort_key_name = "name"
    sort_reverse = False
    if len(sys.argv) > 1:
        sort_key_name = sys.argv[1]
        if sort_key_name.endswith("_r"):
            sort_reverse = True
            sort_key_name = sort_key_name[:-2]

    if sort_key_name == "name":
        sort_key = lambda x: x[0]
    elif sort_key_name == "count":
        sort_key = lambda x: x[1]
    elif sort_key_name in SORT_KEYS:
        sort_key = lambda x: x[2].get(sort_key_name, 0)
    else:
        # Compatibility fix: nesting the same quote type inside an f-string
        # requires Python 3.12+, so build the list first.
        valid = ", ".join('"{}"'.format(k) for k in SORT_KEYS)
        print(f"Valid sort keys are {valid}")
        print('Prepending "_r" to reverse the sort order')
        sys.exit(1)

    current_directory = os.getcwd()
    directories = (
        d
        for d in os.listdir(current_directory)
        if os.path.isdir(os.path.join(current_directory, d))
    )

    stats = []
    grand_total = Counter()
    for directory in directories:
        dir_path = os.path.join(current_directory, directory)
        counts = count_files(dir_path)
        total_files = sum(v for k, v in counts.items() if k in EXT2ORDER)
        stats.append((directory, total_files, counts))
        grand_total.update(counts)

    stats.sort(key=sort_key, reverse=sort_reverse)
    # Sentinel row (directory=None) carrying the grand total.
    stats.append(
        (None, sum(v for k, v in grand_total.items() if k in EXT2ORDER), grand_total)
    )

    for directory, total_files, counts in stats:
        if total_files == 0:
            continue
        if directory is None:
            # Bug fix: the original fell through and also printed
            # "Directory: None" after the grand-total header.
            print("Grand Total:")
        else:
            print(f"Directory: {directory}")
        for ext, count in sorted(
            counts.items(), key=lambda x: EXT2ORDER.get(x[0], -1)
        ):
            if count == 0 or ext not in EXT2COLOR:
                continue
            print(f"{EXT2COLOR[ext]}{ext} files: {count}{RESET}")
        tally_color = GREEN if total_files >= 200 else ORANGE
        print(
            f"{tally_color}Total files: {total_files}{RESET} "
            f"({counts['image']} images, {counts['text']} texts)"
        )
        print()
|
105 |
+
|
106 |
+
|
107 |
+
if __name__ == "__main__":
|
108 |
+
main()
|
whisper
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from transformers import pipeline
|
6 |
+
import sys
|
7 |
+
import os
|
8 |
+
|
9 |
+
MODEL_NAME = "openai/whisper-large-v3-turbo"
|
10 |
+
BATCH_SIZE = 8
|
11 |
+
|
12 |
+
device = 0 if torch.cuda.is_available() else "cpu"
|
13 |
+
|
14 |
+
pipe = pipeline(
|
15 |
+
task="automatic-speech-recognition",
|
16 |
+
model=MODEL_NAME,
|
17 |
+
chunk_length_s=30,
|
18 |
+
device=device,
|
19 |
+
)
|
20 |
+
|
21 |
+
def transcribe(audio_file_path, task="transcribe"):
    """Run the Whisper ASR pipeline on an audio file.

    Args:
        audio_file_path: Path to the audio file on disk.
        task: "transcribe" or "translate".

    Returns:
        The recognized text, or None when the file is missing or the
        pipeline raises an exception.
    """
    if not os.path.exists(audio_file_path):
        print(f"Error: The file '{audio_file_path}' does not exist.")
        return

    try:
        result = pipe(
            audio_file_path,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        return result["text"]
    except Exception as e:
        print(f"Error during transcription: {str(e)}")
        return None
|
32 |
+
|
33 |
+
if __name__ == "__main__":
|
34 |
+
if len(sys.argv) < 2:
|
35 |
+
print("Usage: python script.py <audio_file_path> [task]")
|
36 |
+
print("task can be 'transcribe' or 'translate' (default is 'transcribe')")
|
37 |
+
sys.exit(1)
|
38 |
+
|
39 |
+
audio_file_path = sys.argv[1]
|
40 |
+
task = sys.argv[2] if len(sys.argv) > 2 else "transcribe"
|
41 |
+
|
42 |
+
if task not in ["transcribe", "translate"]:
|
43 |
+
print("Error: task must be either 'transcribe' or 'translate'")
|
44 |
+
sys.exit(1)
|
45 |
+
|
46 |
+
result = transcribe(audio_file_path, task)
|
47 |
+
if result:
|
48 |
+
print("Transcription result:")
|
49 |
+
print(result)
|
whisper2
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
|
6 |
+
import sys
|
7 |
+
import os
|
8 |
+
import warnings
|
9 |
+
|
10 |
+
# Suppress specific warnings
|
11 |
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
12 |
+
warnings.filterwarnings("ignore", category=UserWarning)
|
13 |
+
|
14 |
+
MODEL_NAME = "openai/whisper-large-v3"
|
15 |
+
BATCH_SIZE = 8
|
16 |
+
|
17 |
+
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
18 |
+
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
19 |
+
|
20 |
+
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
21 |
+
MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
|
22 |
+
)
|
23 |
+
model.to(device)
|
24 |
+
|
25 |
+
processor = AutoProcessor.from_pretrained(MODEL_NAME)
|
26 |
+
|
27 |
+
pipe = pipeline(
|
28 |
+
"automatic-speech-recognition",
|
29 |
+
model=model,
|
30 |
+
tokenizer=processor.tokenizer,
|
31 |
+
feature_extractor=processor.feature_extractor,
|
32 |
+
# max_new_tokens=448,
|
33 |
+
chunk_length_s=30,
|
34 |
+
batch_size=BATCH_SIZE,
|
35 |
+
return_timestamps=True,
|
36 |
+
torch_dtype=torch_dtype,
|
37 |
+
device=device,
|
38 |
+
)
|
39 |
+
|
40 |
+
def transcribe(audio_file_path, task="transcribe"):
    """Transcribe (or translate) an audio file with the preloaded pipeline.

    Prints the full pipeline result for inspection and returns the
    recognized text, or None on a missing file / pipeline error.
    """
    if not os.path.exists(audio_file_path):
        print(f"Error: The file '{audio_file_path}' does not exist.")
        return

    try:
        # Inference only — disable autograd bookkeeping.
        with torch.no_grad():
            result = pipe(audio_file_path, generate_kwargs={"task": task})
        from pprint import pprint
        pprint(result)
        return result["text"]
    except Exception as e:
        print(f"Error during transcription: {str(e)}")
        return None
|
54 |
+
|
55 |
+
if __name__ == "__main__":
|
56 |
+
if len(sys.argv) < 2:
|
57 |
+
print("Usage: python script.py <audio_file_path> [task]")
|
58 |
+
print("task can be 'transcribe' or 'translate' (default is 'transcribe')")
|
59 |
+
sys.exit(1)
|
60 |
+
|
61 |
+
audio_file_path = sys.argv[1]
|
62 |
+
task = sys.argv[2] if len(sys.argv) > 2 else "transcribe"
|
63 |
+
|
64 |
+
if task not in ["transcribe", "translate"]:
|
65 |
+
print("Error: task must be either 'transcribe' or 'translate'")
|
66 |
+
sys.exit(1)
|
67 |
+
|
68 |
+
result = transcribe(audio_file_path, task)
|
69 |
+
if result:
|
70 |
+
print("Transcription result:")
|
71 |
+
print(result)
|
72 |
+
|
zsh/png2mp4.zsh
ADDED
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# png2mp4_x265()
# Converts a series of PNG images to an MP4 video using x265 encoding
#
# Usage: png2mp4_x265 [--step <number>] [--repeat <number>]
#
# Options:
#   --step <number>   : Multiplier for step count in overlay text (default: 1)
#   --repeat <number> : Number of frames each source image is held (default: 1)
#
# This function:
#   1. Deactivates the conda environment
#   2. Counts the PNG files in the current directory
#   3. Names the output after the current directory ("<dir>_x265.mp4")
#   4. Runs a single ffmpeg pass with x265 encoding, including:
#      - output frame rate of 60 fps (each image shown `repeat` frames)
#      - image scaling to 1024x1024
#      - a step-count overlay (frame index * step multiplier)
#      - high-quality, animation-tuned x265 settings
#   5. Appends 8 seconds of cloned padding and a 5-second fade-out
#   6. Reactivates the conda environment
#
# Requirements:
#   - ffmpeg with libx265 support
#   - Inconsolata-Light.ttf font in /usr/share/fonts/TTF/
png2mp4_x265() {
    conda deactivate
    local step_multiplier=1
    local repeat=1
    local frame_rate=60
    local resolution="1024x1024"

    while [[ "$#" -gt 0 ]]; do
        case $1 in
            --step) step_multiplier="$2"; shift ;;
            --repeat) repeat="$2"; shift ;;
            *) echo "Unknown parameter passed: $1"; return 1 ;;
        esac
        shift
    done

    # Name the output after the working directory so runs are self-describing.
    # `local` keeps the variable from leaking into the interactive shell.
    local output_filename="$(basename "$(pwd)")_x265.mp4"
    echo "Output filename: $output_filename"

    local nframes=$(find . -type f -name '*.png' | wc -l)
    # Trailing '.' on the divisor forces zsh floating-point arithmetic.
    local duration=$(($nframes * $repeat / ${frame_rate}.))
    local fade_start=$((duration + 3))
    echo "Found $nframes for a duration of $duration seconds"

    echo "Running ffmpeg with x265 encoding..."
    local font=/usr/share/fonts/TTF/Inconsolata-Light.ttf
    local drawtext="drawtext=fontfile=${font}:text='Steps\: %{eif\\:trunc(n*$step_multiplier)\\:u\\:3}':x=10:y=h-th-10:fontsize=24:fontcolor=white"
    # Clone the last frame for 8s, then fade out over 5s starting 3s in.
    local fadeout="tpad=stop_mode=clone:stop_duration=8,fade=t=out:st=${fade_start}:d=5"
    local encoder=(
        -pix_fmt yuv420p
        -c:v libx265
        -preset slower
        -tune animation
        -crf 22
        # Keyframe interval tied to `repeat` puts one keyframe per source image.
        -x265-params "keyint=${repeat}:min-keyint=$((repeat-1)):scenecut=0:ref=5:bframes=8:b-adapt=2:rc-lookahead=$((2*repeat)):lookahead-slices=4:aq-mode=3:aq-strength=0.8:deblock=-1,-1:sao=0"
    )

    # BUG FIX: write to the announced "$output_filename" instead of the
    # hard-coded "sample_x265.mp4" the function previously produced.
    ffmpeg -framerate "$frame_rate/$repeat" -pattern_type glob -i "*.png" \
        -vf "scale=${resolution},${drawtext},fps=${frame_rate},${fadeout}" \
        "${encoder[@]}" -y "$output_filename"
    if [ $? -ne 0 ]; then
        echo "Error: ffmpeg command failed."
        return 1
    fi

    conda activate
    echo "Process completed successfully."
}
|
73 |
+
|
74 |
+
# png2mp4()
|
75 |
+
# Converts a series of PNG images to an MP4 video using x264 encoding
|
76 |
+
#
|
77 |
+
# Usage: png2mp4 [--max <number>] [--step <number>] [--repeat <number>]
|
78 |
+
#
|
79 |
+
# Options:
|
80 |
+
# --max <number> : Maximum number of images to process
|
81 |
+
# --step <number> : Multiplier for step count in overlay text (default: 1)
|
82 |
+
# --repeat <number> : Number of times to repeat each image (default: 1)
|
83 |
+
#
|
84 |
+
# This function:
|
85 |
+
# 1. Deactivates conda environment
|
86 |
+
# 2. Creates a temporary directory for processing
|
87 |
+
# 3. Finds all PNG files in the current directory
|
88 |
+
# 4. Uses the current directory name as the output filename prefix
|
89 |
+
# 5. Copies and optionally repeats images to the temp directory
|
90 |
+
# 6. Uses ffmpeg to create an MP4 with x264 encoding, including:
|
91 |
+
# - Frame rate of 60 fps
|
92 |
+
# - Image scaling to 1024x1024
|
93 |
+
# - Step count overlay text (divided by 1,000,000 and truncated to remove 6 zeros and decimal places)
|
94 |
+
# - CRF value of 28 for compression
|
95 |
+
# 7. Adds padding and fade-out effect to the final video
|
96 |
+
# 8. Cleans up temporary files
|
97 |
+
# 9. Reactivates conda environment
|
98 |
+
#
|
99 |
+
# Requirements:
|
100 |
+
# - ffmpeg with libx264 support
|
101 |
+
# - bc (basic calculator)
|
102 |
+
# - Inconsolata-Light.ttf font in /usr/share/fonts/TTF/
|
103 |
+
png2mp4() {
|
104 |
+
conda deactivate
|
105 |
+
local max_images=""
|
106 |
+
local step_multiplier=1
|
107 |
+
local repeat=1
|
108 |
+
local temp_dir="/home/kade/.local/tmp"
|
109 |
+
|
110 |
+
while [[ "$#" -gt 0 ]]; do
|
111 |
+
case $1 in
|
112 |
+
--max) max_images="$2"; shift ;;
|
113 |
+
--step) step_multiplier="$2"; shift ;;
|
114 |
+
--repeat) repeat="$2"; shift ;;
|
115 |
+
*) echo "Unknown parameter passed: $1"; return 1 ;;
|
116 |
+
esac
|
117 |
+
shift
|
118 |
+
done
|
119 |
+
|
120 |
+
echo "Creating temporary directory..."
|
121 |
+
mkdir -p "$temp_dir"
|
122 |
+
|
123 |
+
echo "Checking for PNG files..."
|
124 |
+
png_files=($(/usr/bin/env ls *.png 2>/dev/null))
|
125 |
+
if [ ${#png_files[@]} -eq 0 ]; then
|
126 |
+
echo "Error: No PNG files found in the current directory."
|
127 |
+
return 1
|
128 |
+
fi
|
129 |
+
|
130 |
+
echo "Setting output filename..."
|
131 |
+
output_filename="$(basename "$(pwd)").mp4"
|
132 |
+
echo "Output filename: $output_filename"
|
133 |
+
|
134 |
+
echo "Creating repeated images..."
|
135 |
+
for img in "${png_files[@]}"; do
|
136 |
+
for i in $(seq 1 $repeat); do
|
137 |
+
cp "$img" "$temp_dir/${img%.*}_${i}.png"
|
138 |
+
done
|
139 |
+
done
|
140 |
+
|
141 |
+
echo "Running ffmpeg..."
|
142 |
+
if [[ -n "$max_images" ]]; then
|
143 |
+
ffmpeg -framerate 60 -pattern_type glob -i "$temp_dir/*.png" -vf "scale=1024x1024,select='not(mod(n\,$max_images))',drawtext=fontfile=/usr/share/fonts/TTF/Inconsolata-Light.ttf:text='Steps\: %{expr\:trunc(n*$step_multiplier/$repeat/1000000)}':x=10:y=h-th-10:fontsize=24:fontcolor=white" -crf 28 \
|
144 |
+
-c:v libx264 -pix_fmt yuv420p -y "$temp_dir/temp.mp4"
|
145 |
+
else
|
146 |
+
ffmpeg -framerate 60 -pattern_type glob -i "$temp_dir/*.png" -vf "scale=1024x1024,drawtext=fontfile=/usr/share/fonts/TTF/Inconsolata-Light.ttf:text='Steps\: %{expr\:trunc(n*$step_multiplier/$repeat/1000000)}':x=10:y=h-th-10:fontsize=24:fontcolor=white" -crf 28 \
|
147 |
+
-c:v libx264 -pix_fmt yuv420p -y "$temp_dir/temp.mp4"
|
148 |
+
fi
|
149 |
+
|
150 |
+
if [ $? -ne 0 ]; then
|
151 |
+
echo "Error: ffmpeg command failed."
|
152 |
+
rm -rf "$temp_dir"
|
153 |
+
return 1
|
154 |
+
fi
|
155 |
+
|
156 |
+
echo "Processing final video..."
|
157 |
+
duration=$(ffmpeg -i "$temp_dir/temp.mp4" 2>&1 | grep 'Duration' | awk '{print $2}' | tr -d , | awk -F: '{print ($1 * 3600) + ($2 * 60) + $3}')
|
158 |
+
fade_start=$(echo "$duration + 3" | bc)
|
159 |
+
ffmpeg -i "$temp_dir/temp.mp4" -vf "tpad=stop_mode=clone:stop_duration=8,fade=t=out:st=$fade_start:d=5" -c:v libx264 -pix_fmt yuv420p -y "$output_filename"
|
160 |
+
|
161 |
+
if [ $? -ne 0 ]; then
|
162 |
+
echo "Error: Final ffmpeg processing failed."
|
163 |
+
rm -rf "$temp_dir"
|
164 |
+
return 1
|
165 |
+
fi
|
166 |
+
|
167 |
+
echo "Cleaning up temporary files..."
|
168 |
+
rm -rf "$temp_dir"
|
169 |
+
|
170 |
+
conda activate
|
171 |
+
echo "Process completed successfully."
|
172 |
+
}
|