Merge branch 'main' of hf.co:/k4d3/toolkit
Browse files- .zshrc +86 -56
- audiogen_medium.py +18 -0
- {9em124t2-499968 → cgrkzexw-599808}/clip_model.pt +1 -1
- {9em124t2-499968 → cgrkzexw-599808}/config.yaml +3 -3
- cgrkzexw-599808/image_adapter.pt +3 -0
- {9em124t2-499968 → cgrkzexw-599808}/text_model/README.md +1 -1
- {9em124t2-499968 → cgrkzexw-599808}/text_model/adapter_config.json +8 -3
- {9em124t2-499968 → cgrkzexw-599808}/text_model/adapter_model.safetensors +2 -2
- cgrkzexw-599808/text_model/special_tokens_map.json +23 -0
- cgrkzexw-599808/text_model/tokenizer.json +0 -0
- cgrkzexw-599808/text_model/tokenizer_config.json +2064 -0
- crawl/crawl +3 -3
- crawl/crawl_wikipedia +2 -2
- joy +455 -272
- keyframe +0 -0
- ogg2wav +38 -0
- paper-qa.code-workspace +0 -11
- remove_extra_whitespace +60 -0
- remove_tag +33 -0
- stats +108 -0
- whisper +49 -0
- whisper2 +72 -0
- zsh/png2mp4.zsh +172 -0
.zshrc
CHANGED
@@ -8,6 +8,20 @@
|
|
8 |
# - conda-env: Adds support for Conda environment management
|
9 |
# 4. Set the custom theme for the shell prompt
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
# Load the custom git wrapper script
|
12 |
source $HOME/toolkit/git-wrapper.zsh
|
13 |
|
@@ -70,6 +84,30 @@ export OMP_NUM_THREADS=24
|
|
70 |
# However, it may slightly reduce performance in some scenarios where parallel tokenization is beneficial
|
71 |
export TOKENIZERS_PARALLELISM=false
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
# Source the Oh My Zsh script
|
74 |
# This line loads Oh My Zsh, a popular framework for managing Zsh configuration
|
75 |
# It sets up various features like themes, plugins, and custom functions
|
@@ -156,30 +194,6 @@ alias rl="source ~/.zshrc"
|
|
156 |
# Alias for quickly editing and reloading the zsh configuration file
|
157 |
alias ezc="nvim ~/.zshrc && source ~/.zshrc"
|
158 |
|
159 |
-
# Source the broot launcher script for enhanced file navigation
|
160 |
-
source /home/kade/.config/broot/launcher/bash/br
|
161 |
-
|
162 |
-
# Source the fzf (Fuzzy Finder) configuration for zsh if it exists
|
163 |
-
# This enables fzf functionality in the shell, including keybindings and auto-completion
|
164 |
-
[ -f ~/.fzf.zsh ] && source ~/.fzf.zsh
|
165 |
-
|
166 |
-
# >>> conda initialize >>>
|
167 |
-
# !! Contents within this block are managed by 'conda init' !!
|
168 |
-
__conda_setup="$('/home/kade/miniconda3/bin/conda' 'shell.zsh' 'hook' 2> /dev/null)"
|
169 |
-
if [ $? -eq 0 ]; then
|
170 |
-
eval "$__conda_setup"
|
171 |
-
else
|
172 |
-
if [ -f "/home/kade/miniconda3/etc/profile.d/conda.sh" ]; then
|
173 |
-
. "/home/kade/miniconda3/etc/profile.d/conda.sh"
|
174 |
-
else
|
175 |
-
export PATH="/home/kade/miniconda3/bin:$PATH"
|
176 |
-
fi
|
177 |
-
fi
|
178 |
-
unset __conda_setup
|
179 |
-
# <<< conda initialize <<<
|
180 |
-
|
181 |
-
unset CONDA_CHANGEPS1
|
182 |
-
|
183 |
display_git_help() {
|
184 |
echo "Git"
|
185 |
echo "---"
|
@@ -769,6 +783,52 @@ chop_lora() {
|
|
769 |
done
|
770 |
}
|
771 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
772 |
# Function: swch (Switch Git Branch)
|
773 |
# Description:
|
774 |
# This function facilitates switching between Git branches while ensuring a clean working directory.
|
@@ -857,34 +917,8 @@ import safetensors, json
|
|
857 |
filePath = '$filePath'
|
858 |
print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
|
859 |
}
|
860 |
-
|
861 |
-
|
862 |
-
# This function converts a series of PNG images into an MP4 video file using ffmpeg.
|
863 |
-
#
|
864 |
-
# Usage:
|
865 |
-
# png2mp4
|
866 |
-
#
|
867 |
-
# Parameters:
|
868 |
-
# None (uses all PNG files in the current directory)
|
869 |
-
#
|
870 |
-
# Actions:
|
871 |
-
# 1. Sets the frame rate to 8 fps
|
872 |
-
# 2. Uses glob pattern to include all PNG files in the current directory
|
873 |
-
# 3. Scales the output video to 1024x1024 resolution
|
874 |
-
# 4. Sets the Constant Rate Factor (CRF) to 28 for good compression
|
875 |
-
# 5. Uses the libx264 codec for H.264 encoding
|
876 |
-
# 6. Sets the pixel format to yuv420p for compatibility
|
877 |
-
# 7. Outputs the result as 'out.mp4' in the current directory
|
878 |
-
#
|
879 |
-
# Notes:
|
880 |
-
# - Requires ffmpeg to be installed and accessible in the system path
|
881 |
-
# - PNG files should be in the current directory
|
882 |
-
# - Output video will be named 'out.mp4' and placed in the current directory
|
883 |
-
# - Adjust the framerate, scale, or CRF value as needed for different results
|
884 |
-
png2mp4() {
|
885 |
-
ffmpeg -framerate 8 -pattern_type glob -i '*.png' -vf scale=1024x1024 -crf 28 \
|
886 |
-
-c:v libx264 -pix_fmt yuv420p out.mp4
|
887 |
-
}
|
888 |
|
889 |
# Function: c
|
890 |
# Description:
|
@@ -920,10 +954,6 @@ c() {
|
|
920 |
conda activate comfyui
|
921 |
python main.py --listen 0.0.0.0 --preview-method taesd --use-pytorch-cross-attention --disable-xformers --front-end-version Comfy-Org/ComfyUI_frontend@latest --fast
|
922 |
}
|
923 |
-
|
924 |
-
# Function: conda_prompt_info
|
925 |
-
# Description:
|
926 |
-
# This function displays information about the currently active Conda environment.
|
927 |
#
|
928 |
# Usage:
|
929 |
# conda_prompt_info
|
|
|
8 |
# - conda-env: Adds support for Conda environment management
|
9 |
# 4. Set the custom theme for the shell prompt
|
10 |
|
11 |
+
# The `export QT_QPA_PLATFORM=offscreen` command is used to set the `QT_QPA_PLATFORM`
|
12 |
+
# environment variable to `offscreen`. This is particularly useful when running Qt
|
13 |
+
# applications in a headless environment, such as a server or a CI/CD pipeline,
|
14 |
+
# where there is no display server available. By setting this variable, Qt
|
15 |
+
# applications can render their graphical output offscreen, allowing them to
|
16 |
+
# run without requiring a graphical user interface (GUI). This is commonly used for
|
17 |
+
# automated testing, rendering, or other tasks that do not require user interaction.
|
18 |
+
export QT_QPA_PLATFORM=offscreen
|
19 |
+
|
20 |
+
# Enable the experimental Just-In-Time (JIT) compiler for Python 3.13.
|
21 |
+
# This can improve performance by compiling Python code to machine code at runtime.
|
22 |
+
# Note: The JIT is only available for x86_64 builds of Python in conda[^1^][1].
|
23 |
+
export PYTHON_JIT=1
|
24 |
+
|
25 |
# Load the custom git wrapper script
|
26 |
source $HOME/toolkit/git-wrapper.zsh
|
27 |
|
|
|
84 |
# However, it may slightly reduce performance in some scenarios where parallel tokenization is beneficial
|
85 |
export TOKENIZERS_PARALLELISM=false
|
86 |
|
87 |
+
# Source the broot launcher script for enhanced file navigation
|
88 |
+
source /home/kade/.config/broot/launcher/bash/br
|
89 |
+
|
90 |
+
# Source the fzf (Fuzzy Finder) configuration for zsh if it exists
|
91 |
+
# This enables fzf functionality in the shell, including keybindings and auto-completion
|
92 |
+
[ -f ~/.fzf.zsh ] && source ~/.fzf.zsh
|
93 |
+
|
94 |
+
# >>> conda initialize >>>
|
95 |
+
# !! Contents within this block are managed by 'conda init' !!
|
96 |
+
__conda_setup="$('/home/kade/miniconda3/bin/conda' 'shell.zsh' 'hook' 2> /dev/null)"
|
97 |
+
if [ $? -eq 0 ]; then
|
98 |
+
eval "$__conda_setup"
|
99 |
+
else
|
100 |
+
if [ -f "/home/kade/miniconda3/etc/profile.d/conda.sh" ]; then
|
101 |
+
. "/home/kade/miniconda3/etc/profile.d/conda.sh"
|
102 |
+
else
|
103 |
+
export PATH="/home/kade/miniconda3/bin:$PATH"
|
104 |
+
fi
|
105 |
+
fi
|
106 |
+
unset __conda_setup
|
107 |
+
# <<< conda initialize <<<
|
108 |
+
|
109 |
+
unset CONDA_CHANGEPS1
|
110 |
+
|
111 |
# Source the Oh My Zsh script
|
112 |
# This line loads Oh My Zsh, a popular framework for managing Zsh configuration
|
113 |
# It sets up various features like themes, plugins, and custom functions
|
|
|
194 |
# Alias for quickly editing and reloading the zsh configuration file
|
195 |
alias ezc="nvim ~/.zshrc && source ~/.zshrc"
|
196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
display_git_help() {
|
198 |
echo "Git"
|
199 |
echo "---"
|
|
|
783 |
done
|
784 |
}
|
785 |
|
786 |
+
# Function cs1
|
787 |
+
# This function chops blocks from an SDXL LoRA's safetensors file to preserve the style information only.
|
788 |
+
# It uses a specific block configuration and saves the output with a modified filename.
|
789 |
+
cs1() {
|
790 |
+
# Get the target safetensors file path from the first argument
|
791 |
+
local target_safetensors=$1
|
792 |
+
|
793 |
+
# Extract the base name of the target safetensors file (without the .safetensors extension)
|
794 |
+
local base_name=$(basename "$target_safetensors" .safetensors)
|
795 |
+
|
796 |
+
# Extract the version and step string from the base name (e.g., v1s400)
|
797 |
+
local version_step=$(echo "$base_name" | grep -o 'v[0-9]*s[0-9]*')
|
798 |
+
|
799 |
+
# Remove the version and step string from the base name to avoid duplication
|
800 |
+
local base_name_no_version=$(echo "$base_name" | sed "s/${version_step}//")
|
801 |
+
|
802 |
+
# Construct the output safetensors filename by appending c1 to the version and step string
|
803 |
+
local output_safetensors="${base_name_no_version}${version_step}c1.safetensors"
|
804 |
+
|
805 |
+
# Run the chop_blocks command with the specified block configuration and output filename
|
806 |
+
~/toolkit/chop_blocks "$target_safetensors" 1,0,0,0,0,0,0,1,1,0,0,0,1,1,1,1,1,1,0,0,0 -o "$output_safetensors"
|
807 |
+
}
|
808 |
+
|
809 |
+
# Function cs2
|
810 |
+
# This function chops blocks from an SDXL LoRA's safetensors file to preserve the style information only.
|
811 |
+
# It uses a different block configuration and saves the output with a modified filename.
|
812 |
+
cs2() {
|
813 |
+
# Get the target safetensors file path from the first argument
|
814 |
+
local target_safetensors=$1
|
815 |
+
|
816 |
+
# Extract the base name of the target safetensors file (without the .safetensors extension)
|
817 |
+
local base_name=$(basename "$target_safetensors" .safetensors)
|
818 |
+
|
819 |
+
# Extract the version and step string from the base name (e.g., v1s400)
|
820 |
+
local version_step=$(echo "$base_name" | grep -o 'v[0-9]*s[0-9]*')
|
821 |
+
|
822 |
+
# Remove the version and step string from the base name to avoid duplication
|
823 |
+
local base_name_no_version=$(echo "$base_name" | sed "s/${version_step}//")
|
824 |
+
|
825 |
+
# Construct the output safetensors filename by appending c2 to the version and step string
|
826 |
+
local output_safetensors="${base_name_no_version}${version_step}c2.safetensors"
|
827 |
+
|
828 |
+
# Run the chop_blocks command with the specified block configuration and output filename
|
829 |
+
~/toolkit/chop_blocks "$target_safetensors" 1,0,0,0,1,1,0,1,1,0,0,0,1,1,1,1,1,1,0,0,0 -o "$output_safetensors"
|
830 |
+
}
|
831 |
+
|
832 |
# Function: swch (Switch Git Branch)
|
833 |
# Description:
|
834 |
# This function facilitates switching between Git branches while ensuring a clean working directory.
|
|
|
917 |
filePath = '$filePath'
|
918 |
print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
|
919 |
}
|
920 |
+
|
921 |
+
source ~/toolkit/zsh/png2mp4.zsh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
922 |
|
923 |
# Function: c
|
924 |
# Description:
|
|
|
954 |
conda activate comfyui
|
955 |
python main.py --listen 0.0.0.0 --preview-method taesd --use-pytorch-cross-attention --disable-xformers --front-end-version Comfy-Org/ComfyUI_frontend@latest --fast
|
956 |
}
|
|
|
|
|
|
|
|
|
957 |
#
|
958 |
# Usage:
|
959 |
# conda_prompt_info
|
audiogen_medium.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import torchaudio
|
3 |
+
from audiocraft.models import AudioGen
|
4 |
+
from audiocraft.data.audio import audio_write
|
5 |
+
|
6 |
+
model = AudioGen.get_pretrained('facebook/audiogen-medium')
|
7 |
+
model.set_generation_params(duration=5) # generate 5 seconds.
|
8 |
+
descriptions = sys.argv[1:]
|
9 |
+
if not descriptions:
|
10 |
+
print('At least one prompt should be provided')
|
11 |
+
sys.exit(1)
|
12 |
+
wav = model.generate(descriptions) # generates 3 samples.
|
13 |
+
|
14 |
+
for one_wav, description in zip(wav, descriptions):
|
15 |
+
file_name = description.replace(' ', '_')
|
16 |
+
# Will save under {idx}.wav, with loudness normalization at -14 db LUFS.
|
17 |
+
audio_write(file_name, one_wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)
|
18 |
+
|
{9em124t2-499968 → cgrkzexw-599808}/clip_model.pt
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1713067838
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9277e041aab3e7f20a8e6ecf7248b663aac1c281daf4472c12a6e5013cf9f0cc
|
3 |
size 1713067838
|
{9em124t2-499968 → cgrkzexw-599808}/config.yaml
RENAMED
@@ -3,7 +3,7 @@ device_batch_size: 2
|
|
3 |
batch_size: 256
|
4 |
learning_rate: 0.0002
|
5 |
warmup_samples: 18000
|
6 |
-
max_samples:
|
7 |
save_every: 50000
|
8 |
test_every: 50000
|
9 |
use_amp: true
|
@@ -19,9 +19,9 @@ adam_beta2: 0.999
|
|
19 |
adam_eps: 1.0e-08
|
20 |
adam_weight_decay: 0.0
|
21 |
clip_grad_norm: 1.0
|
22 |
-
dataset: fancyfeast/joy-captioning-
|
23 |
clip_model: google/siglip-so400m-patch14-384
|
24 |
-
text_model:
|
25 |
resume: null
|
26 |
gradient_checkpointing: false
|
27 |
test_size: 2048
|
|
|
3 |
batch_size: 256
|
4 |
learning_rate: 0.0002
|
5 |
warmup_samples: 18000
|
6 |
+
max_samples: 600000
|
7 |
save_every: 50000
|
8 |
test_every: 50000
|
9 |
use_amp: true
|
|
|
19 |
adam_eps: 1.0e-08
|
20 |
adam_weight_decay: 0.0
|
21 |
clip_grad_norm: 1.0
|
22 |
+
dataset: fancyfeast/joy-captioning-20240924a
|
23 |
clip_model: google/siglip-so400m-patch14-384
|
24 |
+
text_model: ../lora-train/lora_model_vwbzycxh
|
25 |
resume: null
|
26 |
gradient_checkpointing: false
|
27 |
test_size: 2048
|
cgrkzexw-599808/image_adapter.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:38db2fe263be2d494a50be4a7bbfd7b23b76f9d03e4008a1b7df97d6b27894ef
|
3 |
+
size 86067714
|
{9em124t2-499968 → cgrkzexw-599808}/text_model/README.md
RENAMED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
base_model:
|
3 |
library_name: peft
|
4 |
---
|
5 |
|
|
|
1 |
---
|
2 |
+
base_model: unsloth/Meta-Llama-3.1-8B-Instruct
|
3 |
library_name: peft
|
4 |
---
|
5 |
|
{9em124t2-499968 → cgrkzexw-599808}/text_model/adapter_config.json
RENAMED
@@ -1,7 +1,7 @@
|
|
1 |
{
|
2 |
"alpha_pattern": {},
|
3 |
"auto_mapping": null,
|
4 |
-
"base_model_name_or_path": "
|
5 |
"bias": "none",
|
6 |
"fan_in_fan_out": false,
|
7 |
"inference_mode": true,
|
@@ -11,7 +11,7 @@
|
|
11 |
"layers_to_transform": null,
|
12 |
"loftq_config": {},
|
13 |
"lora_alpha": 16,
|
14 |
-
"lora_dropout": 0
|
15 |
"megatron_config": null,
|
16 |
"megatron_core": "megatron.core",
|
17 |
"modules_to_save": null,
|
@@ -21,7 +21,12 @@
|
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
"q_proj",
|
24 |
-
"v_proj"
|
|
|
|
|
|
|
|
|
|
|
25 |
],
|
26 |
"task_type": "CAUSAL_LM",
|
27 |
"use_dora": false,
|
|
|
1 |
{
|
2 |
"alpha_pattern": {},
|
3 |
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "unsloth/Meta-Llama-3.1-8B-Instruct",
|
5 |
"bias": "none",
|
6 |
"fan_in_fan_out": false,
|
7 |
"inference_mode": true,
|
|
|
11 |
"layers_to_transform": null,
|
12 |
"loftq_config": {},
|
13 |
"lora_alpha": 16,
|
14 |
+
"lora_dropout": 0,
|
15 |
"megatron_config": null,
|
16 |
"megatron_core": "megatron.core",
|
17 |
"modules_to_save": null,
|
|
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
"q_proj",
|
24 |
+
"v_proj",
|
25 |
+
"gate_proj",
|
26 |
+
"down_proj",
|
27 |
+
"o_proj",
|
28 |
+
"k_proj",
|
29 |
+
"up_proj"
|
30 |
],
|
31 |
"task_type": "CAUSAL_LM",
|
32 |
"use_dora": false,
|
{9em124t2-499968 → cgrkzexw-599808}/text_model/adapter_model.safetensors
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd883ebd089f87e0fab7f17960c5f4451ceae43aecead44a9984b3369018dbdb
|
3 |
+
size 671149168
|
cgrkzexw-599808/text_model/special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<|begin_of_text|>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "<|eot_id|>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "<|finetune_right_pad_id|>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
}
|
23 |
+
}
|
cgrkzexw-599808/text_model/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
cgrkzexw-599808/text_model/tokenizer_config.json
ADDED
@@ -0,0 +1,2064 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"128000": {
|
4 |
+
"content": "<|begin_of_text|>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"128001": {
|
12 |
+
"content": "<|end_of_text|>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"128002": {
|
20 |
+
"content": "<|reserved_special_token_0|>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"128003": {
|
28 |
+
"content": "<|reserved_special_token_1|>",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"128004": {
|
36 |
+
"content": "<|finetune_right_pad_id|>",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
},
|
43 |
+
"128005": {
|
44 |
+
"content": "<|reserved_special_token_2|>",
|
45 |
+
"lstrip": false,
|
46 |
+
"normalized": false,
|
47 |
+
"rstrip": false,
|
48 |
+
"single_word": false,
|
49 |
+
"special": true
|
50 |
+
},
|
51 |
+
"128006": {
|
52 |
+
"content": "<|start_header_id|>",
|
53 |
+
"lstrip": false,
|
54 |
+
"normalized": false,
|
55 |
+
"rstrip": false,
|
56 |
+
"single_word": false,
|
57 |
+
"special": true
|
58 |
+
},
|
59 |
+
"128007": {
|
60 |
+
"content": "<|end_header_id|>",
|
61 |
+
"lstrip": false,
|
62 |
+
"normalized": false,
|
63 |
+
"rstrip": false,
|
64 |
+
"single_word": false,
|
65 |
+
"special": true
|
66 |
+
},
|
67 |
+
"128008": {
|
68 |
+
"content": "<|eom_id|>",
|
69 |
+
"lstrip": false,
|
70 |
+
"normalized": false,
|
71 |
+
"rstrip": false,
|
72 |
+
"single_word": false,
|
73 |
+
"special": true
|
74 |
+
},
|
75 |
+
"128009": {
|
76 |
+
"content": "<|eot_id|>",
|
77 |
+
"lstrip": false,
|
78 |
+
"normalized": false,
|
79 |
+
"rstrip": false,
|
80 |
+
"single_word": false,
|
81 |
+
"special": true
|
82 |
+
},
|
83 |
+
"128010": {
|
84 |
+
"content": "<|python_tag|>",
|
85 |
+
"lstrip": false,
|
86 |
+
"normalized": false,
|
87 |
+
"rstrip": false,
|
88 |
+
"single_word": false,
|
89 |
+
"special": true
|
90 |
+
},
|
91 |
+
"128011": {
|
92 |
+
"content": "<|reserved_special_token_3|>",
|
93 |
+
"lstrip": false,
|
94 |
+
"normalized": false,
|
95 |
+
"rstrip": false,
|
96 |
+
"single_word": false,
|
97 |
+
"special": true
|
98 |
+
},
|
99 |
+
"128012": {
|
100 |
+
"content": "<|reserved_special_token_4|>",
|
101 |
+
"lstrip": false,
|
102 |
+
"normalized": false,
|
103 |
+
"rstrip": false,
|
104 |
+
"single_word": false,
|
105 |
+
"special": true
|
106 |
+
},
|
107 |
+
"128013": {
|
108 |
+
"content": "<|reserved_special_token_5|>",
|
109 |
+
"lstrip": false,
|
110 |
+
"normalized": false,
|
111 |
+
"rstrip": false,
|
112 |
+
"single_word": false,
|
113 |
+
"special": true
|
114 |
+
},
|
115 |
+
"128014": {
|
116 |
+
"content": "<|reserved_special_token_6|>",
|
117 |
+
"lstrip": false,
|
118 |
+
"normalized": false,
|
119 |
+
"rstrip": false,
|
120 |
+
"single_word": false,
|
121 |
+
"special": true
|
122 |
+
},
|
123 |
+
"128015": {
|
124 |
+
"content": "<|reserved_special_token_7|>",
|
125 |
+
"lstrip": false,
|
126 |
+
"normalized": false,
|
127 |
+
"rstrip": false,
|
128 |
+
"single_word": false,
|
129 |
+
"special": true
|
130 |
+
},
|
131 |
+
"128016": {
|
132 |
+
"content": "<|reserved_special_token_8|>",
|
133 |
+
"lstrip": false,
|
134 |
+
"normalized": false,
|
135 |
+
"rstrip": false,
|
136 |
+
"single_word": false,
|
137 |
+
"special": true
|
138 |
+
},
|
139 |
+
"128017": {
|
140 |
+
"content": "<|reserved_special_token_9|>",
|
141 |
+
"lstrip": false,
|
142 |
+
"normalized": false,
|
143 |
+
"rstrip": false,
|
144 |
+
"single_word": false,
|
145 |
+
"special": true
|
146 |
+
},
|
147 |
+
"128018": {
|
148 |
+
"content": "<|reserved_special_token_10|>",
|
149 |
+
"lstrip": false,
|
150 |
+
"normalized": false,
|
151 |
+
"rstrip": false,
|
152 |
+
"single_word": false,
|
153 |
+
"special": true
|
154 |
+
},
|
155 |
+
"128019": {
|
156 |
+
"content": "<|reserved_special_token_11|>",
|
157 |
+
"lstrip": false,
|
158 |
+
"normalized": false,
|
159 |
+
"rstrip": false,
|
160 |
+
"single_word": false,
|
161 |
+
"special": true
|
162 |
+
},
|
163 |
+
"128020": {
|
164 |
+
"content": "<|reserved_special_token_12|>",
|
165 |
+
"lstrip": false,
|
166 |
+
"normalized": false,
|
167 |
+
"rstrip": false,
|
168 |
+
"single_word": false,
|
169 |
+
"special": true
|
170 |
+
},
|
171 |
+
"128021": {
|
172 |
+
"content": "<|reserved_special_token_13|>",
|
173 |
+
"lstrip": false,
|
174 |
+
"normalized": false,
|
175 |
+
"rstrip": false,
|
176 |
+
"single_word": false,
|
177 |
+
"special": true
|
178 |
+
},
|
179 |
+
"128022": {
|
180 |
+
"content": "<|reserved_special_token_14|>",
|
181 |
+
"lstrip": false,
|
182 |
+
"normalized": false,
|
183 |
+
"rstrip": false,
|
184 |
+
"single_word": false,
|
185 |
+
"special": true
|
186 |
+
},
|
187 |
+
"128023": {
|
188 |
+
"content": "<|reserved_special_token_15|>",
|
189 |
+
"lstrip": false,
|
190 |
+
"normalized": false,
|
191 |
+
"rstrip": false,
|
192 |
+
"single_word": false,
|
193 |
+
"special": true
|
194 |
+
},
|
195 |
+
"128024": {
|
196 |
+
"content": "<|reserved_special_token_16|>",
|
197 |
+
"lstrip": false,
|
198 |
+
"normalized": false,
|
199 |
+
"rstrip": false,
|
200 |
+
"single_word": false,
|
201 |
+
"special": true
|
202 |
+
},
|
203 |
+
"128025": {
|
204 |
+
"content": "<|reserved_special_token_17|>",
|
205 |
+
"lstrip": false,
|
206 |
+
"normalized": false,
|
207 |
+
"rstrip": false,
|
208 |
+
"single_word": false,
|
209 |
+
"special": true
|
210 |
+
},
|
211 |
+
"128026": {
|
212 |
+
"content": "<|reserved_special_token_18|>",
|
213 |
+
"lstrip": false,
|
214 |
+
"normalized": false,
|
215 |
+
"rstrip": false,
|
216 |
+
"single_word": false,
|
217 |
+
"special": true
|
218 |
+
},
|
219 |
+
"128027": {
|
220 |
+
"content": "<|reserved_special_token_19|>",
|
221 |
+
"lstrip": false,
|
222 |
+
"normalized": false,
|
223 |
+
"rstrip": false,
|
224 |
+
"single_word": false,
|
225 |
+
"special": true
|
226 |
+
},
|
227 |
+
"128028": {
|
228 |
+
"content": "<|reserved_special_token_20|>",
|
229 |
+
"lstrip": false,
|
230 |
+
"normalized": false,
|
231 |
+
"rstrip": false,
|
232 |
+
"single_word": false,
|
233 |
+
"special": true
|
234 |
+
},
|
235 |
+
"128029": {
|
236 |
+
"content": "<|reserved_special_token_21|>",
|
237 |
+
"lstrip": false,
|
238 |
+
"normalized": false,
|
239 |
+
"rstrip": false,
|
240 |
+
"single_word": false,
|
241 |
+
"special": true
|
242 |
+
},
|
243 |
+
"128030": {
|
244 |
+
"content": "<|reserved_special_token_22|>",
|
245 |
+
"lstrip": false,
|
246 |
+
"normalized": false,
|
247 |
+
"rstrip": false,
|
248 |
+
"single_word": false,
|
249 |
+
"special": true
|
250 |
+
},
|
251 |
+
"128031": {
|
252 |
+
"content": "<|reserved_special_token_23|>",
|
253 |
+
"lstrip": false,
|
254 |
+
"normalized": false,
|
255 |
+
"rstrip": false,
|
256 |
+
"single_word": false,
|
257 |
+
"special": true
|
258 |
+
},
|
259 |
+
"128032": {
|
260 |
+
"content": "<|reserved_special_token_24|>",
|
261 |
+
"lstrip": false,
|
262 |
+
"normalized": false,
|
263 |
+
"rstrip": false,
|
264 |
+
"single_word": false,
|
265 |
+
"special": true
|
266 |
+
},
|
267 |
+
"128033": {
|
268 |
+
"content": "<|reserved_special_token_25|>",
|
269 |
+
"lstrip": false,
|
270 |
+
"normalized": false,
|
271 |
+
"rstrip": false,
|
272 |
+
"single_word": false,
|
273 |
+
"special": true
|
274 |
+
},
|
275 |
+
"128034": {
|
276 |
+
"content": "<|reserved_special_token_26|>",
|
277 |
+
"lstrip": false,
|
278 |
+
"normalized": false,
|
279 |
+
"rstrip": false,
|
280 |
+
"single_word": false,
|
281 |
+
"special": true
|
282 |
+
},
|
283 |
+
"128035": {
|
284 |
+
"content": "<|reserved_special_token_27|>",
|
285 |
+
"lstrip": false,
|
286 |
+
"normalized": false,
|
287 |
+
"rstrip": false,
|
288 |
+
"single_word": false,
|
289 |
+
"special": true
|
290 |
+
},
|
291 |
+
"128036": {
|
292 |
+
"content": "<|reserved_special_token_28|>",
|
293 |
+
"lstrip": false,
|
294 |
+
"normalized": false,
|
295 |
+
"rstrip": false,
|
296 |
+
"single_word": false,
|
297 |
+
"special": true
|
298 |
+
},
|
299 |
+
"128037": {
|
300 |
+
"content": "<|reserved_special_token_29|>",
|
301 |
+
"lstrip": false,
|
302 |
+
"normalized": false,
|
303 |
+
"rstrip": false,
|
304 |
+
"single_word": false,
|
305 |
+
"special": true
|
306 |
+
},
|
307 |
+
"128038": {
|
308 |
+
"content": "<|reserved_special_token_30|>",
|
309 |
+
"lstrip": false,
|
310 |
+
"normalized": false,
|
311 |
+
"rstrip": false,
|
312 |
+
"single_word": false,
|
313 |
+
"special": true
|
314 |
+
},
|
315 |
+
"128039": {
|
316 |
+
"content": "<|reserved_special_token_31|>",
|
317 |
+
"lstrip": false,
|
318 |
+
"normalized": false,
|
319 |
+
"rstrip": false,
|
320 |
+
"single_word": false,
|
321 |
+
"special": true
|
322 |
+
},
|
323 |
+
"128040": {
|
324 |
+
"content": "<|reserved_special_token_32|>",
|
325 |
+
"lstrip": false,
|
326 |
+
"normalized": false,
|
327 |
+
"rstrip": false,
|
328 |
+
"single_word": false,
|
329 |
+
"special": true
|
330 |
+
},
|
331 |
+
"128041": {
|
332 |
+
"content": "<|reserved_special_token_33|>",
|
333 |
+
"lstrip": false,
|
334 |
+
"normalized": false,
|
335 |
+
"rstrip": false,
|
336 |
+
"single_word": false,
|
337 |
+
"special": true
|
338 |
+
},
|
339 |
+
"128042": {
|
340 |
+
"content": "<|reserved_special_token_34|>",
|
341 |
+
"lstrip": false,
|
342 |
+
"normalized": false,
|
343 |
+
"rstrip": false,
|
344 |
+
"single_word": false,
|
345 |
+
"special": true
|
346 |
+
},
|
347 |
+
"128043": {
|
348 |
+
"content": "<|reserved_special_token_35|>",
|
349 |
+
"lstrip": false,
|
350 |
+
"normalized": false,
|
351 |
+
"rstrip": false,
|
352 |
+
"single_word": false,
|
353 |
+
"special": true
|
354 |
+
},
|
355 |
+
"128044": {
|
356 |
+
"content": "<|reserved_special_token_36|>",
|
357 |
+
"lstrip": false,
|
358 |
+
"normalized": false,
|
359 |
+
"rstrip": false,
|
360 |
+
"single_word": false,
|
361 |
+
"special": true
|
362 |
+
},
|
363 |
+
"128045": {
|
364 |
+
"content": "<|reserved_special_token_37|>",
|
365 |
+
"lstrip": false,
|
366 |
+
"normalized": false,
|
367 |
+
"rstrip": false,
|
368 |
+
"single_word": false,
|
369 |
+
"special": true
|
370 |
+
},
|
371 |
+
"128046": {
|
372 |
+
"content": "<|reserved_special_token_38|>",
|
373 |
+
"lstrip": false,
|
374 |
+
"normalized": false,
|
375 |
+
"rstrip": false,
|
376 |
+
"single_word": false,
|
377 |
+
"special": true
|
378 |
+
},
|
379 |
+
"128047": {
|
380 |
+
"content": "<|reserved_special_token_39|>",
|
381 |
+
"lstrip": false,
|
382 |
+
"normalized": false,
|
383 |
+
"rstrip": false,
|
384 |
+
"single_word": false,
|
385 |
+
"special": true
|
386 |
+
},
|
387 |
+
"128048": {
|
388 |
+
"content": "<|reserved_special_token_40|>",
|
389 |
+
"lstrip": false,
|
390 |
+
"normalized": false,
|
391 |
+
"rstrip": false,
|
392 |
+
"single_word": false,
|
393 |
+
"special": true
|
394 |
+
},
|
395 |
+
"128049": {
|
396 |
+
"content": "<|reserved_special_token_41|>",
|
397 |
+
"lstrip": false,
|
398 |
+
"normalized": false,
|
399 |
+
"rstrip": false,
|
400 |
+
"single_word": false,
|
401 |
+
"special": true
|
402 |
+
},
|
403 |
+
"128050": {
|
404 |
+
"content": "<|reserved_special_token_42|>",
|
405 |
+
"lstrip": false,
|
406 |
+
"normalized": false,
|
407 |
+
"rstrip": false,
|
408 |
+
"single_word": false,
|
409 |
+
"special": true
|
410 |
+
},
|
411 |
+
"128051": {
|
412 |
+
"content": "<|reserved_special_token_43|>",
|
413 |
+
"lstrip": false,
|
414 |
+
"normalized": false,
|
415 |
+
"rstrip": false,
|
416 |
+
"single_word": false,
|
417 |
+
"special": true
|
418 |
+
},
|
419 |
+
"128052": {
|
420 |
+
"content": "<|reserved_special_token_44|>",
|
421 |
+
"lstrip": false,
|
422 |
+
"normalized": false,
|
423 |
+
"rstrip": false,
|
424 |
+
"single_word": false,
|
425 |
+
"special": true
|
426 |
+
},
|
427 |
+
"128053": {
|
428 |
+
"content": "<|reserved_special_token_45|>",
|
429 |
+
"lstrip": false,
|
430 |
+
"normalized": false,
|
431 |
+
"rstrip": false,
|
432 |
+
"single_word": false,
|
433 |
+
"special": true
|
434 |
+
},
|
435 |
+
"128054": {
|
436 |
+
"content": "<|reserved_special_token_46|>",
|
437 |
+
"lstrip": false,
|
438 |
+
"normalized": false,
|
439 |
+
"rstrip": false,
|
440 |
+
"single_word": false,
|
441 |
+
"special": true
|
442 |
+
},
|
443 |
+
"128055": {
|
444 |
+
"content": "<|reserved_special_token_47|>",
|
445 |
+
"lstrip": false,
|
446 |
+
"normalized": false,
|
447 |
+
"rstrip": false,
|
448 |
+
"single_word": false,
|
449 |
+
"special": true
|
450 |
+
},
|
451 |
+
"128056": {
|
452 |
+
"content": "<|reserved_special_token_48|>",
|
453 |
+
"lstrip": false,
|
454 |
+
"normalized": false,
|
455 |
+
"rstrip": false,
|
456 |
+
"single_word": false,
|
457 |
+
"special": true
|
458 |
+
},
|
459 |
+
"128057": {
|
460 |
+
"content": "<|reserved_special_token_49|>",
|
461 |
+
"lstrip": false,
|
462 |
+
"normalized": false,
|
463 |
+
"rstrip": false,
|
464 |
+
"single_word": false,
|
465 |
+
"special": true
|
466 |
+
},
|
467 |
+
"128058": {
|
468 |
+
"content": "<|reserved_special_token_50|>",
|
469 |
+
"lstrip": false,
|
470 |
+
"normalized": false,
|
471 |
+
"rstrip": false,
|
472 |
+
"single_word": false,
|
473 |
+
"special": true
|
474 |
+
},
|
475 |
+
"128059": {
|
476 |
+
"content": "<|reserved_special_token_51|>",
|
477 |
+
"lstrip": false,
|
478 |
+
"normalized": false,
|
479 |
+
"rstrip": false,
|
480 |
+
"single_word": false,
|
481 |
+
"special": true
|
482 |
+
},
|
483 |
+
"128060": {
|
484 |
+
"content": "<|reserved_special_token_52|>",
|
485 |
+
"lstrip": false,
|
486 |
+
"normalized": false,
|
487 |
+
"rstrip": false,
|
488 |
+
"single_word": false,
|
489 |
+
"special": true
|
490 |
+
},
|
491 |
+
"128061": {
|
492 |
+
"content": "<|reserved_special_token_53|>",
|
493 |
+
"lstrip": false,
|
494 |
+
"normalized": false,
|
495 |
+
"rstrip": false,
|
496 |
+
"single_word": false,
|
497 |
+
"special": true
|
498 |
+
},
|
499 |
+
"128062": {
|
500 |
+
"content": "<|reserved_special_token_54|>",
|
501 |
+
"lstrip": false,
|
502 |
+
"normalized": false,
|
503 |
+
"rstrip": false,
|
504 |
+
"single_word": false,
|
505 |
+
"special": true
|
506 |
+
},
|
507 |
+
"128063": {
|
508 |
+
"content": "<|reserved_special_token_55|>",
|
509 |
+
"lstrip": false,
|
510 |
+
"normalized": false,
|
511 |
+
"rstrip": false,
|
512 |
+
"single_word": false,
|
513 |
+
"special": true
|
514 |
+
},
|
515 |
+
"128064": {
|
516 |
+
"content": "<|reserved_special_token_56|>",
|
517 |
+
"lstrip": false,
|
518 |
+
"normalized": false,
|
519 |
+
"rstrip": false,
|
520 |
+
"single_word": false,
|
521 |
+
"special": true
|
522 |
+
},
|
523 |
+
"128065": {
|
524 |
+
"content": "<|reserved_special_token_57|>",
|
525 |
+
"lstrip": false,
|
526 |
+
"normalized": false,
|
527 |
+
"rstrip": false,
|
528 |
+
"single_word": false,
|
529 |
+
"special": true
|
530 |
+
},
|
531 |
+
"128066": {
|
532 |
+
"content": "<|reserved_special_token_58|>",
|
533 |
+
"lstrip": false,
|
534 |
+
"normalized": false,
|
535 |
+
"rstrip": false,
|
536 |
+
"single_word": false,
|
537 |
+
"special": true
|
538 |
+
},
|
539 |
+
"128067": {
|
540 |
+
"content": "<|reserved_special_token_59|>",
|
541 |
+
"lstrip": false,
|
542 |
+
"normalized": false,
|
543 |
+
"rstrip": false,
|
544 |
+
"single_word": false,
|
545 |
+
"special": true
|
546 |
+
},
|
547 |
+
"128068": {
|
548 |
+
"content": "<|reserved_special_token_60|>",
|
549 |
+
"lstrip": false,
|
550 |
+
"normalized": false,
|
551 |
+
"rstrip": false,
|
552 |
+
"single_word": false,
|
553 |
+
"special": true
|
554 |
+
},
|
555 |
+
"128069": {
|
556 |
+
"content": "<|reserved_special_token_61|>",
|
557 |
+
"lstrip": false,
|
558 |
+
"normalized": false,
|
559 |
+
"rstrip": false,
|
560 |
+
"single_word": false,
|
561 |
+
"special": true
|
562 |
+
},
|
563 |
+
"128070": {
|
564 |
+
"content": "<|reserved_special_token_62|>",
|
565 |
+
"lstrip": false,
|
566 |
+
"normalized": false,
|
567 |
+
"rstrip": false,
|
568 |
+
"single_word": false,
|
569 |
+
"special": true
|
570 |
+
},
|
571 |
+
"128071": {
|
572 |
+
"content": "<|reserved_special_token_63|>",
|
573 |
+
"lstrip": false,
|
574 |
+
"normalized": false,
|
575 |
+
"rstrip": false,
|
576 |
+
"single_word": false,
|
577 |
+
"special": true
|
578 |
+
},
|
579 |
+
"128072": {
|
580 |
+
"content": "<|reserved_special_token_64|>",
|
581 |
+
"lstrip": false,
|
582 |
+
"normalized": false,
|
583 |
+
"rstrip": false,
|
584 |
+
"single_word": false,
|
585 |
+
"special": true
|
586 |
+
},
|
587 |
+
"128073": {
|
588 |
+
"content": "<|reserved_special_token_65|>",
|
589 |
+
"lstrip": false,
|
590 |
+
"normalized": false,
|
591 |
+
"rstrip": false,
|
592 |
+
"single_word": false,
|
593 |
+
"special": true
|
594 |
+
},
|
595 |
+
"128074": {
|
596 |
+
"content": "<|reserved_special_token_66|>",
|
597 |
+
"lstrip": false,
|
598 |
+
"normalized": false,
|
599 |
+
"rstrip": false,
|
600 |
+
"single_word": false,
|
601 |
+
"special": true
|
602 |
+
},
|
603 |
+
"128075": {
|
604 |
+
"content": "<|reserved_special_token_67|>",
|
605 |
+
"lstrip": false,
|
606 |
+
"normalized": false,
|
607 |
+
"rstrip": false,
|
608 |
+
"single_word": false,
|
609 |
+
"special": true
|
610 |
+
},
|
611 |
+
"128076": {
|
612 |
+
"content": "<|reserved_special_token_68|>",
|
613 |
+
"lstrip": false,
|
614 |
+
"normalized": false,
|
615 |
+
"rstrip": false,
|
616 |
+
"single_word": false,
|
617 |
+
"special": true
|
618 |
+
},
|
619 |
+
"128077": {
|
620 |
+
"content": "<|reserved_special_token_69|>",
|
621 |
+
"lstrip": false,
|
622 |
+
"normalized": false,
|
623 |
+
"rstrip": false,
|
624 |
+
"single_word": false,
|
625 |
+
"special": true
|
626 |
+
},
|
627 |
+
"128078": {
|
628 |
+
"content": "<|reserved_special_token_70|>",
|
629 |
+
"lstrip": false,
|
630 |
+
"normalized": false,
|
631 |
+
"rstrip": false,
|
632 |
+
"single_word": false,
|
633 |
+
"special": true
|
634 |
+
},
|
635 |
+
"128079": {
|
636 |
+
"content": "<|reserved_special_token_71|>",
|
637 |
+
"lstrip": false,
|
638 |
+
"normalized": false,
|
639 |
+
"rstrip": false,
|
640 |
+
"single_word": false,
|
641 |
+
"special": true
|
642 |
+
},
|
643 |
+
"128080": {
|
644 |
+
"content": "<|reserved_special_token_72|>",
|
645 |
+
"lstrip": false,
|
646 |
+
"normalized": false,
|
647 |
+
"rstrip": false,
|
648 |
+
"single_word": false,
|
649 |
+
"special": true
|
650 |
+
},
|
651 |
+
"128081": {
|
652 |
+
"content": "<|reserved_special_token_73|>",
|
653 |
+
"lstrip": false,
|
654 |
+
"normalized": false,
|
655 |
+
"rstrip": false,
|
656 |
+
"single_word": false,
|
657 |
+
"special": true
|
658 |
+
},
|
659 |
+
"128082": {
|
660 |
+
"content": "<|reserved_special_token_74|>",
|
661 |
+
"lstrip": false,
|
662 |
+
"normalized": false,
|
663 |
+
"rstrip": false,
|
664 |
+
"single_word": false,
|
665 |
+
"special": true
|
666 |
+
},
|
667 |
+
"128083": {
|
668 |
+
"content": "<|reserved_special_token_75|>",
|
669 |
+
"lstrip": false,
|
670 |
+
"normalized": false,
|
671 |
+
"rstrip": false,
|
672 |
+
"single_word": false,
|
673 |
+
"special": true
|
674 |
+
},
|
675 |
+
"128084": {
|
676 |
+
"content": "<|reserved_special_token_76|>",
|
677 |
+
"lstrip": false,
|
678 |
+
"normalized": false,
|
679 |
+
"rstrip": false,
|
680 |
+
"single_word": false,
|
681 |
+
"special": true
|
682 |
+
},
|
683 |
+
"128085": {
|
684 |
+
"content": "<|reserved_special_token_77|>",
|
685 |
+
"lstrip": false,
|
686 |
+
"normalized": false,
|
687 |
+
"rstrip": false,
|
688 |
+
"single_word": false,
|
689 |
+
"special": true
|
690 |
+
},
|
691 |
+
"128086": {
|
692 |
+
"content": "<|reserved_special_token_78|>",
|
693 |
+
"lstrip": false,
|
694 |
+
"normalized": false,
|
695 |
+
"rstrip": false,
|
696 |
+
"single_word": false,
|
697 |
+
"special": true
|
698 |
+
},
|
699 |
+
"128087": {
|
700 |
+
"content": "<|reserved_special_token_79|>",
|
701 |
+
"lstrip": false,
|
702 |
+
"normalized": false,
|
703 |
+
"rstrip": false,
|
704 |
+
"single_word": false,
|
705 |
+
"special": true
|
706 |
+
},
|
707 |
+
"128088": {
|
708 |
+
"content": "<|reserved_special_token_80|>",
|
709 |
+
"lstrip": false,
|
710 |
+
"normalized": false,
|
711 |
+
"rstrip": false,
|
712 |
+
"single_word": false,
|
713 |
+
"special": true
|
714 |
+
},
|
715 |
+
"128089": {
|
716 |
+
"content": "<|reserved_special_token_81|>",
|
717 |
+
"lstrip": false,
|
718 |
+
"normalized": false,
|
719 |
+
"rstrip": false,
|
720 |
+
"single_word": false,
|
721 |
+
"special": true
|
722 |
+
},
|
723 |
+
"128090": {
|
724 |
+
"content": "<|reserved_special_token_82|>",
|
725 |
+
"lstrip": false,
|
726 |
+
"normalized": false,
|
727 |
+
"rstrip": false,
|
728 |
+
"single_word": false,
|
729 |
+
"special": true
|
730 |
+
},
|
731 |
+
"128091": {
|
732 |
+
"content": "<|reserved_special_token_83|>",
|
733 |
+
"lstrip": false,
|
734 |
+
"normalized": false,
|
735 |
+
"rstrip": false,
|
736 |
+
"single_word": false,
|
737 |
+
"special": true
|
738 |
+
},
|
739 |
+
"128092": {
|
740 |
+
"content": "<|reserved_special_token_84|>",
|
741 |
+
"lstrip": false,
|
742 |
+
"normalized": false,
|
743 |
+
"rstrip": false,
|
744 |
+
"single_word": false,
|
745 |
+
"special": true
|
746 |
+
},
|
747 |
+
"128093": {
|
748 |
+
"content": "<|reserved_special_token_85|>",
|
749 |
+
"lstrip": false,
|
750 |
+
"normalized": false,
|
751 |
+
"rstrip": false,
|
752 |
+
"single_word": false,
|
753 |
+
"special": true
|
754 |
+
},
|
755 |
+
"128094": {
|
756 |
+
"content": "<|reserved_special_token_86|>",
|
757 |
+
"lstrip": false,
|
758 |
+
"normalized": false,
|
759 |
+
"rstrip": false,
|
760 |
+
"single_word": false,
|
761 |
+
"special": true
|
762 |
+
},
|
763 |
+
"128095": {
|
764 |
+
"content": "<|reserved_special_token_87|>",
|
765 |
+
"lstrip": false,
|
766 |
+
"normalized": false,
|
767 |
+
"rstrip": false,
|
768 |
+
"single_word": false,
|
769 |
+
"special": true
|
770 |
+
},
|
771 |
+
"128096": {
|
772 |
+
"content": "<|reserved_special_token_88|>",
|
773 |
+
"lstrip": false,
|
774 |
+
"normalized": false,
|
775 |
+
"rstrip": false,
|
776 |
+
"single_word": false,
|
777 |
+
"special": true
|
778 |
+
},
|
779 |
+
"128097": {
|
780 |
+
"content": "<|reserved_special_token_89|>",
|
781 |
+
"lstrip": false,
|
782 |
+
"normalized": false,
|
783 |
+
"rstrip": false,
|
784 |
+
"single_word": false,
|
785 |
+
"special": true
|
786 |
+
},
|
787 |
+
"128098": {
|
788 |
+
"content": "<|reserved_special_token_90|>",
|
789 |
+
"lstrip": false,
|
790 |
+
"normalized": false,
|
791 |
+
"rstrip": false,
|
792 |
+
"single_word": false,
|
793 |
+
"special": true
|
794 |
+
},
|
795 |
+
"128099": {
|
796 |
+
"content": "<|reserved_special_token_91|>",
|
797 |
+
"lstrip": false,
|
798 |
+
"normalized": false,
|
799 |
+
"rstrip": false,
|
800 |
+
"single_word": false,
|
801 |
+
"special": true
|
802 |
+
},
|
803 |
+
"128100": {
|
804 |
+
"content": "<|reserved_special_token_92|>",
|
805 |
+
"lstrip": false,
|
806 |
+
"normalized": false,
|
807 |
+
"rstrip": false,
|
808 |
+
"single_word": false,
|
809 |
+
"special": true
|
810 |
+
},
|
811 |
+
"128101": {
|
812 |
+
"content": "<|reserved_special_token_93|>",
|
813 |
+
"lstrip": false,
|
814 |
+
"normalized": false,
|
815 |
+
"rstrip": false,
|
816 |
+
"single_word": false,
|
817 |
+
"special": true
|
818 |
+
},
|
819 |
+
"128102": {
|
820 |
+
"content": "<|reserved_special_token_94|>",
|
821 |
+
"lstrip": false,
|
822 |
+
"normalized": false,
|
823 |
+
"rstrip": false,
|
824 |
+
"single_word": false,
|
825 |
+
"special": true
|
826 |
+
},
|
827 |
+
"128103": {
|
828 |
+
"content": "<|reserved_special_token_95|>",
|
829 |
+
"lstrip": false,
|
830 |
+
"normalized": false,
|
831 |
+
"rstrip": false,
|
832 |
+
"single_word": false,
|
833 |
+
"special": true
|
834 |
+
},
|
835 |
+
"128104": {
|
836 |
+
"content": "<|reserved_special_token_96|>",
|
837 |
+
"lstrip": false,
|
838 |
+
"normalized": false,
|
839 |
+
"rstrip": false,
|
840 |
+
"single_word": false,
|
841 |
+
"special": true
|
842 |
+
},
|
843 |
+
"128105": {
|
844 |
+
"content": "<|reserved_special_token_97|>",
|
845 |
+
"lstrip": false,
|
846 |
+
"normalized": false,
|
847 |
+
"rstrip": false,
|
848 |
+
"single_word": false,
|
849 |
+
"special": true
|
850 |
+
},
|
851 |
+
"128106": {
|
852 |
+
"content": "<|reserved_special_token_98|>",
|
853 |
+
"lstrip": false,
|
854 |
+
"normalized": false,
|
855 |
+
"rstrip": false,
|
856 |
+
"single_word": false,
|
857 |
+
"special": true
|
858 |
+
},
|
859 |
+
"128107": {
|
860 |
+
"content": "<|reserved_special_token_99|>",
|
861 |
+
"lstrip": false,
|
862 |
+
"normalized": false,
|
863 |
+
"rstrip": false,
|
864 |
+
"single_word": false,
|
865 |
+
"special": true
|
866 |
+
},
|
867 |
+
"128108": {
|
868 |
+
"content": "<|reserved_special_token_100|>",
|
869 |
+
"lstrip": false,
|
870 |
+
"normalized": false,
|
871 |
+
"rstrip": false,
|
872 |
+
"single_word": false,
|
873 |
+
"special": true
|
874 |
+
},
|
875 |
+
"128109": {
|
876 |
+
"content": "<|reserved_special_token_101|>",
|
877 |
+
"lstrip": false,
|
878 |
+
"normalized": false,
|
879 |
+
"rstrip": false,
|
880 |
+
"single_word": false,
|
881 |
+
"special": true
|
882 |
+
},
|
883 |
+
"128110": {
|
884 |
+
"content": "<|reserved_special_token_102|>",
|
885 |
+
"lstrip": false,
|
886 |
+
"normalized": false,
|
887 |
+
"rstrip": false,
|
888 |
+
"single_word": false,
|
889 |
+
"special": true
|
890 |
+
},
|
891 |
+
"128111": {
|
892 |
+
"content": "<|reserved_special_token_103|>",
|
893 |
+
"lstrip": false,
|
894 |
+
"normalized": false,
|
895 |
+
"rstrip": false,
|
896 |
+
"single_word": false,
|
897 |
+
"special": true
|
898 |
+
},
|
899 |
+
"128112": {
|
900 |
+
"content": "<|reserved_special_token_104|>",
|
901 |
+
"lstrip": false,
|
902 |
+
"normalized": false,
|
903 |
+
"rstrip": false,
|
904 |
+
"single_word": false,
|
905 |
+
"special": true
|
906 |
+
},
|
907 |
+
"128113": {
|
908 |
+
"content": "<|reserved_special_token_105|>",
|
909 |
+
"lstrip": false,
|
910 |
+
"normalized": false,
|
911 |
+
"rstrip": false,
|
912 |
+
"single_word": false,
|
913 |
+
"special": true
|
914 |
+
},
|
915 |
+
"128114": {
|
916 |
+
"content": "<|reserved_special_token_106|>",
|
917 |
+
"lstrip": false,
|
918 |
+
"normalized": false,
|
919 |
+
"rstrip": false,
|
920 |
+
"single_word": false,
|
921 |
+
"special": true
|
922 |
+
},
|
923 |
+
"128115": {
|
924 |
+
"content": "<|reserved_special_token_107|>",
|
925 |
+
"lstrip": false,
|
926 |
+
"normalized": false,
|
927 |
+
"rstrip": false,
|
928 |
+
"single_word": false,
|
929 |
+
"special": true
|
930 |
+
},
|
931 |
+
"128116": {
|
932 |
+
"content": "<|reserved_special_token_108|>",
|
933 |
+
"lstrip": false,
|
934 |
+
"normalized": false,
|
935 |
+
"rstrip": false,
|
936 |
+
"single_word": false,
|
937 |
+
"special": true
|
938 |
+
},
|
939 |
+
"128117": {
|
940 |
+
"content": "<|reserved_special_token_109|>",
|
941 |
+
"lstrip": false,
|
942 |
+
"normalized": false,
|
943 |
+
"rstrip": false,
|
944 |
+
"single_word": false,
|
945 |
+
"special": true
|
946 |
+
},
|
947 |
+
"128118": {
|
948 |
+
"content": "<|reserved_special_token_110|>",
|
949 |
+
"lstrip": false,
|
950 |
+
"normalized": false,
|
951 |
+
"rstrip": false,
|
952 |
+
"single_word": false,
|
953 |
+
"special": true
|
954 |
+
},
|
955 |
+
"128119": {
|
956 |
+
"content": "<|reserved_special_token_111|>",
|
957 |
+
"lstrip": false,
|
958 |
+
"normalized": false,
|
959 |
+
"rstrip": false,
|
960 |
+
"single_word": false,
|
961 |
+
"special": true
|
962 |
+
},
|
963 |
+
"128120": {
|
964 |
+
"content": "<|reserved_special_token_112|>",
|
965 |
+
"lstrip": false,
|
966 |
+
"normalized": false,
|
967 |
+
"rstrip": false,
|
968 |
+
"single_word": false,
|
969 |
+
"special": true
|
970 |
+
},
|
971 |
+
"128121": {
|
972 |
+
"content": "<|reserved_special_token_113|>",
|
973 |
+
"lstrip": false,
|
974 |
+
"normalized": false,
|
975 |
+
"rstrip": false,
|
976 |
+
"single_word": false,
|
977 |
+
"special": true
|
978 |
+
},
|
979 |
+
"128122": {
|
980 |
+
"content": "<|reserved_special_token_114|>",
|
981 |
+
"lstrip": false,
|
982 |
+
"normalized": false,
|
983 |
+
"rstrip": false,
|
984 |
+
"single_word": false,
|
985 |
+
"special": true
|
986 |
+
},
|
987 |
+
"128123": {
|
988 |
+
"content": "<|reserved_special_token_115|>",
|
989 |
+
"lstrip": false,
|
990 |
+
"normalized": false,
|
991 |
+
"rstrip": false,
|
992 |
+
"single_word": false,
|
993 |
+
"special": true
|
994 |
+
},
|
995 |
+
"128124": {
|
996 |
+
"content": "<|reserved_special_token_116|>",
|
997 |
+
"lstrip": false,
|
998 |
+
"normalized": false,
|
999 |
+
"rstrip": false,
|
1000 |
+
"single_word": false,
|
1001 |
+
"special": true
|
1002 |
+
},
|
1003 |
+
"128125": {
|
1004 |
+
"content": "<|reserved_special_token_117|>",
|
1005 |
+
"lstrip": false,
|
1006 |
+
"normalized": false,
|
1007 |
+
"rstrip": false,
|
1008 |
+
"single_word": false,
|
1009 |
+
"special": true
|
1010 |
+
},
|
1011 |
+
"128126": {
|
1012 |
+
"content": "<|reserved_special_token_118|>",
|
1013 |
+
"lstrip": false,
|
1014 |
+
"normalized": false,
|
1015 |
+
"rstrip": false,
|
1016 |
+
"single_word": false,
|
1017 |
+
"special": true
|
1018 |
+
},
|
1019 |
+
"128127": {
|
1020 |
+
"content": "<|reserved_special_token_119|>",
|
1021 |
+
"lstrip": false,
|
1022 |
+
"normalized": false,
|
1023 |
+
"rstrip": false,
|
1024 |
+
"single_word": false,
|
1025 |
+
"special": true
|
1026 |
+
},
|
1027 |
+
"128128": {
|
1028 |
+
"content": "<|reserved_special_token_120|>",
|
1029 |
+
"lstrip": false,
|
1030 |
+
"normalized": false,
|
1031 |
+
"rstrip": false,
|
1032 |
+
"single_word": false,
|
1033 |
+
"special": true
|
1034 |
+
},
|
1035 |
+
"128129": {
|
1036 |
+
"content": "<|reserved_special_token_121|>",
|
1037 |
+
"lstrip": false,
|
1038 |
+
"normalized": false,
|
1039 |
+
"rstrip": false,
|
1040 |
+
"single_word": false,
|
1041 |
+
"special": true
|
1042 |
+
},
|
1043 |
+
"128130": {
|
1044 |
+
"content": "<|reserved_special_token_122|>",
|
1045 |
+
"lstrip": false,
|
1046 |
+
"normalized": false,
|
1047 |
+
"rstrip": false,
|
1048 |
+
"single_word": false,
|
1049 |
+
"special": true
|
1050 |
+
},
|
1051 |
+
"128131": {
|
1052 |
+
"content": "<|reserved_special_token_123|>",
|
1053 |
+
"lstrip": false,
|
1054 |
+
"normalized": false,
|
1055 |
+
"rstrip": false,
|
1056 |
+
"single_word": false,
|
1057 |
+
"special": true
|
1058 |
+
},
|
1059 |
+
"128132": {
|
1060 |
+
"content": "<|reserved_special_token_124|>",
|
1061 |
+
"lstrip": false,
|
1062 |
+
"normalized": false,
|
1063 |
+
"rstrip": false,
|
1064 |
+
"single_word": false,
|
1065 |
+
"special": true
|
1066 |
+
},
|
1067 |
+
"128133": {
|
1068 |
+
"content": "<|reserved_special_token_125|>",
|
1069 |
+
"lstrip": false,
|
1070 |
+
"normalized": false,
|
1071 |
+
"rstrip": false,
|
1072 |
+
"single_word": false,
|
1073 |
+
"special": true
|
1074 |
+
},
|
1075 |
+
"128134": {
|
1076 |
+
"content": "<|reserved_special_token_126|>",
|
1077 |
+
"lstrip": false,
|
1078 |
+
"normalized": false,
|
1079 |
+
"rstrip": false,
|
1080 |
+
"single_word": false,
|
1081 |
+
"special": true
|
1082 |
+
},
|
1083 |
+
"128135": {
|
1084 |
+
"content": "<|reserved_special_token_127|>",
|
1085 |
+
"lstrip": false,
|
1086 |
+
"normalized": false,
|
1087 |
+
"rstrip": false,
|
1088 |
+
"single_word": false,
|
1089 |
+
"special": true
|
1090 |
+
},
|
1091 |
+
"128136": {
|
1092 |
+
"content": "<|reserved_special_token_128|>",
|
1093 |
+
"lstrip": false,
|
1094 |
+
"normalized": false,
|
1095 |
+
"rstrip": false,
|
1096 |
+
"single_word": false,
|
1097 |
+
"special": true
|
1098 |
+
},
|
1099 |
+
"128137": {
|
1100 |
+
"content": "<|reserved_special_token_129|>",
|
1101 |
+
"lstrip": false,
|
1102 |
+
"normalized": false,
|
1103 |
+
"rstrip": false,
|
1104 |
+
"single_word": false,
|
1105 |
+
"special": true
|
1106 |
+
},
|
1107 |
+
"128138": {
|
1108 |
+
"content": "<|reserved_special_token_130|>",
|
1109 |
+
"lstrip": false,
|
1110 |
+
"normalized": false,
|
1111 |
+
"rstrip": false,
|
1112 |
+
"single_word": false,
|
1113 |
+
"special": true
|
1114 |
+
},
|
1115 |
+
"128139": {
|
1116 |
+
"content": "<|reserved_special_token_131|>",
|
1117 |
+
"lstrip": false,
|
1118 |
+
"normalized": false,
|
1119 |
+
"rstrip": false,
|
1120 |
+
"single_word": false,
|
1121 |
+
"special": true
|
1122 |
+
},
|
1123 |
+
"128140": {
|
1124 |
+
"content": "<|reserved_special_token_132|>",
|
1125 |
+
"lstrip": false,
|
1126 |
+
"normalized": false,
|
1127 |
+
"rstrip": false,
|
1128 |
+
"single_word": false,
|
1129 |
+
"special": true
|
1130 |
+
},
|
1131 |
+
"128141": {
|
1132 |
+
"content": "<|reserved_special_token_133|>",
|
1133 |
+
"lstrip": false,
|
1134 |
+
"normalized": false,
|
1135 |
+
"rstrip": false,
|
1136 |
+
"single_word": false,
|
1137 |
+
"special": true
|
1138 |
+
},
|
1139 |
+
"128142": {
|
1140 |
+
"content": "<|reserved_special_token_134|>",
|
1141 |
+
"lstrip": false,
|
1142 |
+
"normalized": false,
|
1143 |
+
"rstrip": false,
|
1144 |
+
"single_word": false,
|
1145 |
+
"special": true
|
1146 |
+
},
|
1147 |
+
"128143": {
|
1148 |
+
"content": "<|reserved_special_token_135|>",
|
1149 |
+
"lstrip": false,
|
1150 |
+
"normalized": false,
|
1151 |
+
"rstrip": false,
|
1152 |
+
"single_word": false,
|
1153 |
+
"special": true
|
1154 |
+
},
|
1155 |
+
"128144": {
|
1156 |
+
"content": "<|reserved_special_token_136|>",
|
1157 |
+
"lstrip": false,
|
1158 |
+
"normalized": false,
|
1159 |
+
"rstrip": false,
|
1160 |
+
"single_word": false,
|
1161 |
+
"special": true
|
1162 |
+
},
|
1163 |
+
"128145": {
|
1164 |
+
"content": "<|reserved_special_token_137|>",
|
1165 |
+
"lstrip": false,
|
1166 |
+
"normalized": false,
|
1167 |
+
"rstrip": false,
|
1168 |
+
"single_word": false,
|
1169 |
+
"special": true
|
1170 |
+
},
|
1171 |
+
"128146": {
|
1172 |
+
"content": "<|reserved_special_token_138|>",
|
1173 |
+
"lstrip": false,
|
1174 |
+
"normalized": false,
|
1175 |
+
"rstrip": false,
|
1176 |
+
"single_word": false,
|
1177 |
+
"special": true
|
1178 |
+
},
|
1179 |
+
"128147": {
|
1180 |
+
"content": "<|reserved_special_token_139|>",
|
1181 |
+
"lstrip": false,
|
1182 |
+
"normalized": false,
|
1183 |
+
"rstrip": false,
|
1184 |
+
"single_word": false,
|
1185 |
+
"special": true
|
1186 |
+
},
|
1187 |
+
"128148": {
|
1188 |
+
"content": "<|reserved_special_token_140|>",
|
1189 |
+
"lstrip": false,
|
1190 |
+
"normalized": false,
|
1191 |
+
"rstrip": false,
|
1192 |
+
"single_word": false,
|
1193 |
+
"special": true
|
1194 |
+
},
|
1195 |
+
"128149": {
|
1196 |
+
"content": "<|reserved_special_token_141|>",
|
1197 |
+
"lstrip": false,
|
1198 |
+
"normalized": false,
|
1199 |
+
"rstrip": false,
|
1200 |
+
"single_word": false,
|
1201 |
+
"special": true
|
1202 |
+
},
|
1203 |
+
"128150": {
|
1204 |
+
"content": "<|reserved_special_token_142|>",
|
1205 |
+
"lstrip": false,
|
1206 |
+
"normalized": false,
|
1207 |
+
"rstrip": false,
|
1208 |
+
"single_word": false,
|
1209 |
+
"special": true
|
1210 |
+
},
|
1211 |
+
"128151": {
|
1212 |
+
"content": "<|reserved_special_token_143|>",
|
1213 |
+
"lstrip": false,
|
1214 |
+
"normalized": false,
|
1215 |
+
"rstrip": false,
|
1216 |
+
"single_word": false,
|
1217 |
+
"special": true
|
1218 |
+
},
|
1219 |
+
"128152": {
|
1220 |
+
"content": "<|reserved_special_token_144|>",
|
1221 |
+
"lstrip": false,
|
1222 |
+
"normalized": false,
|
1223 |
+
"rstrip": false,
|
1224 |
+
"single_word": false,
|
1225 |
+
"special": true
|
1226 |
+
},
|
1227 |
+
"128153": {
|
1228 |
+
"content": "<|reserved_special_token_145|>",
|
1229 |
+
"lstrip": false,
|
1230 |
+
"normalized": false,
|
1231 |
+
"rstrip": false,
|
1232 |
+
"single_word": false,
|
1233 |
+
"special": true
|
1234 |
+
},
|
1235 |
+
"128154": {
|
1236 |
+
"content": "<|reserved_special_token_146|>",
|
1237 |
+
"lstrip": false,
|
1238 |
+
"normalized": false,
|
1239 |
+
"rstrip": false,
|
1240 |
+
"single_word": false,
|
1241 |
+
"special": true
|
1242 |
+
},
|
1243 |
+
"128155": {
|
1244 |
+
"content": "<|reserved_special_token_147|>",
|
1245 |
+
"lstrip": false,
|
1246 |
+
"normalized": false,
|
1247 |
+
"rstrip": false,
|
1248 |
+
"single_word": false,
|
1249 |
+
"special": true
|
1250 |
+
},
|
1251 |
+
"128156": {
|
1252 |
+
"content": "<|reserved_special_token_148|>",
|
1253 |
+
"lstrip": false,
|
1254 |
+
"normalized": false,
|
1255 |
+
"rstrip": false,
|
1256 |
+
"single_word": false,
|
1257 |
+
"special": true
|
1258 |
+
},
|
1259 |
+
"128157": {
|
1260 |
+
"content": "<|reserved_special_token_149|>",
|
1261 |
+
"lstrip": false,
|
1262 |
+
"normalized": false,
|
1263 |
+
"rstrip": false,
|
1264 |
+
"single_word": false,
|
1265 |
+
"special": true
|
1266 |
+
},
|
1267 |
+
"128158": {
|
1268 |
+
"content": "<|reserved_special_token_150|>",
|
1269 |
+
"lstrip": false,
|
1270 |
+
"normalized": false,
|
1271 |
+
"rstrip": false,
|
1272 |
+
"single_word": false,
|
1273 |
+
"special": true
|
1274 |
+
},
|
1275 |
+
"128159": {
|
1276 |
+
"content": "<|reserved_special_token_151|>",
|
1277 |
+
"lstrip": false,
|
1278 |
+
"normalized": false,
|
1279 |
+
"rstrip": false,
|
1280 |
+
"single_word": false,
|
1281 |
+
"special": true
|
1282 |
+
},
|
1283 |
+
"128160": {
|
1284 |
+
"content": "<|reserved_special_token_152|>",
|
1285 |
+
"lstrip": false,
|
1286 |
+
"normalized": false,
|
1287 |
+
"rstrip": false,
|
1288 |
+
"single_word": false,
|
1289 |
+
"special": true
|
1290 |
+
},
|
1291 |
+
"128161": {
|
1292 |
+
"content": "<|reserved_special_token_153|>",
|
1293 |
+
"lstrip": false,
|
1294 |
+
"normalized": false,
|
1295 |
+
"rstrip": false,
|
1296 |
+
"single_word": false,
|
1297 |
+
"special": true
|
1298 |
+
},
|
1299 |
+
"128162": {
|
1300 |
+
"content": "<|reserved_special_token_154|>",
|
1301 |
+
"lstrip": false,
|
1302 |
+
"normalized": false,
|
1303 |
+
"rstrip": false,
|
1304 |
+
"single_word": false,
|
1305 |
+
"special": true
|
1306 |
+
},
|
1307 |
+
"128163": {
|
1308 |
+
"content": "<|reserved_special_token_155|>",
|
1309 |
+
"lstrip": false,
|
1310 |
+
"normalized": false,
|
1311 |
+
"rstrip": false,
|
1312 |
+
"single_word": false,
|
1313 |
+
"special": true
|
1314 |
+
},
|
1315 |
+
"128164": {
|
1316 |
+
"content": "<|reserved_special_token_156|>",
|
1317 |
+
"lstrip": false,
|
1318 |
+
"normalized": false,
|
1319 |
+
"rstrip": false,
|
1320 |
+
"single_word": false,
|
1321 |
+
"special": true
|
1322 |
+
},
|
1323 |
+
"128165": {
|
1324 |
+
"content": "<|reserved_special_token_157|>",
|
1325 |
+
"lstrip": false,
|
1326 |
+
"normalized": false,
|
1327 |
+
"rstrip": false,
|
1328 |
+
"single_word": false,
|
1329 |
+
"special": true
|
1330 |
+
},
|
1331 |
+
"128166": {
|
1332 |
+
"content": "<|reserved_special_token_158|>",
|
1333 |
+
"lstrip": false,
|
1334 |
+
"normalized": false,
|
1335 |
+
"rstrip": false,
|
1336 |
+
"single_word": false,
|
1337 |
+
"special": true
|
1338 |
+
},
|
1339 |
+
"128167": {
|
1340 |
+
"content": "<|reserved_special_token_159|>",
|
1341 |
+
"lstrip": false,
|
1342 |
+
"normalized": false,
|
1343 |
+
"rstrip": false,
|
1344 |
+
"single_word": false,
|
1345 |
+
"special": true
|
1346 |
+
},
|
1347 |
+
"128168": {
|
1348 |
+
"content": "<|reserved_special_token_160|>",
|
1349 |
+
"lstrip": false,
|
1350 |
+
"normalized": false,
|
1351 |
+
"rstrip": false,
|
1352 |
+
"single_word": false,
|
1353 |
+
"special": true
|
1354 |
+
},
|
1355 |
+
"128169": {
|
1356 |
+
"content": "<|reserved_special_token_161|>",
|
1357 |
+
"lstrip": false,
|
1358 |
+
"normalized": false,
|
1359 |
+
"rstrip": false,
|
1360 |
+
"single_word": false,
|
1361 |
+
"special": true
|
1362 |
+
},
|
1363 |
+
"128170": {
|
1364 |
+
"content": "<|reserved_special_token_162|>",
|
1365 |
+
"lstrip": false,
|
1366 |
+
"normalized": false,
|
1367 |
+
"rstrip": false,
|
1368 |
+
"single_word": false,
|
1369 |
+
"special": true
|
1370 |
+
},
|
1371 |
+
"128171": {
|
1372 |
+
"content": "<|reserved_special_token_163|>",
|
1373 |
+
"lstrip": false,
|
1374 |
+
"normalized": false,
|
1375 |
+
"rstrip": false,
|
1376 |
+
"single_word": false,
|
1377 |
+
"special": true
|
1378 |
+
},
|
1379 |
+
"128172": {
|
1380 |
+
"content": "<|reserved_special_token_164|>",
|
1381 |
+
"lstrip": false,
|
1382 |
+
"normalized": false,
|
1383 |
+
"rstrip": false,
|
1384 |
+
"single_word": false,
|
1385 |
+
"special": true
|
1386 |
+
},
|
1387 |
+
"128173": {
|
1388 |
+
"content": "<|reserved_special_token_165|>",
|
1389 |
+
"lstrip": false,
|
1390 |
+
"normalized": false,
|
1391 |
+
"rstrip": false,
|
1392 |
+
"single_word": false,
|
1393 |
+
"special": true
|
1394 |
+
},
|
1395 |
+
"128174": {
|
1396 |
+
"content": "<|reserved_special_token_166|>",
|
1397 |
+
"lstrip": false,
|
1398 |
+
"normalized": false,
|
1399 |
+
"rstrip": false,
|
1400 |
+
"single_word": false,
|
1401 |
+
"special": true
|
1402 |
+
},
|
1403 |
+
"128175": {
|
1404 |
+
"content": "<|reserved_special_token_167|>",
|
1405 |
+
"lstrip": false,
|
1406 |
+
"normalized": false,
|
1407 |
+
"rstrip": false,
|
1408 |
+
"single_word": false,
|
1409 |
+
"special": true
|
1410 |
+
},
|
1411 |
+
"128176": {
|
1412 |
+
"content": "<|reserved_special_token_168|>",
|
1413 |
+
"lstrip": false,
|
1414 |
+
"normalized": false,
|
1415 |
+
"rstrip": false,
|
1416 |
+
"single_word": false,
|
1417 |
+
"special": true
|
1418 |
+
},
|
1419 |
+
"128177": {
|
1420 |
+
"content": "<|reserved_special_token_169|>",
|
1421 |
+
"lstrip": false,
|
1422 |
+
"normalized": false,
|
1423 |
+
"rstrip": false,
|
1424 |
+
"single_word": false,
|
1425 |
+
"special": true
|
1426 |
+
},
|
1427 |
+
"128178": {
|
1428 |
+
"content": "<|reserved_special_token_170|>",
|
1429 |
+
"lstrip": false,
|
1430 |
+
"normalized": false,
|
1431 |
+
"rstrip": false,
|
1432 |
+
"single_word": false,
|
1433 |
+
"special": true
|
1434 |
+
},
|
1435 |
+
"128179": {
|
1436 |
+
"content": "<|reserved_special_token_171|>",
|
1437 |
+
"lstrip": false,
|
1438 |
+
"normalized": false,
|
1439 |
+
"rstrip": false,
|
1440 |
+
"single_word": false,
|
1441 |
+
"special": true
|
1442 |
+
},
|
1443 |
+
"128180": {
|
1444 |
+
"content": "<|reserved_special_token_172|>",
|
1445 |
+
"lstrip": false,
|
1446 |
+
"normalized": false,
|
1447 |
+
"rstrip": false,
|
1448 |
+
"single_word": false,
|
1449 |
+
"special": true
|
1450 |
+
},
|
1451 |
+
"128181": {
|
1452 |
+
"content": "<|reserved_special_token_173|>",
|
1453 |
+
"lstrip": false,
|
1454 |
+
"normalized": false,
|
1455 |
+
"rstrip": false,
|
1456 |
+
"single_word": false,
|
1457 |
+
"special": true
|
1458 |
+
},
|
1459 |
+
"128182": {
|
1460 |
+
"content": "<|reserved_special_token_174|>",
|
1461 |
+
"lstrip": false,
|
1462 |
+
"normalized": false,
|
1463 |
+
"rstrip": false,
|
1464 |
+
"single_word": false,
|
1465 |
+
"special": true
|
1466 |
+
},
|
1467 |
+
"128183": {
|
1468 |
+
"content": "<|reserved_special_token_175|>",
|
1469 |
+
"lstrip": false,
|
1470 |
+
"normalized": false,
|
1471 |
+
"rstrip": false,
|
1472 |
+
"single_word": false,
|
1473 |
+
"special": true
|
1474 |
+
},
|
1475 |
+
"128184": {
|
1476 |
+
"content": "<|reserved_special_token_176|>",
|
1477 |
+
"lstrip": false,
|
1478 |
+
"normalized": false,
|
1479 |
+
"rstrip": false,
|
1480 |
+
"single_word": false,
|
1481 |
+
"special": true
|
1482 |
+
},
|
1483 |
+
"128185": {
|
1484 |
+
"content": "<|reserved_special_token_177|>",
|
1485 |
+
"lstrip": false,
|
1486 |
+
"normalized": false,
|
1487 |
+
"rstrip": false,
|
1488 |
+
"single_word": false,
|
1489 |
+
"special": true
|
1490 |
+
},
|
1491 |
+
"128186": {
|
1492 |
+
"content": "<|reserved_special_token_178|>",
|
1493 |
+
"lstrip": false,
|
1494 |
+
"normalized": false,
|
1495 |
+
"rstrip": false,
|
1496 |
+
"single_word": false,
|
1497 |
+
"special": true
|
1498 |
+
},
|
1499 |
+
"128187": {
|
1500 |
+
"content": "<|reserved_special_token_179|>",
|
1501 |
+
"lstrip": false,
|
1502 |
+
"normalized": false,
|
1503 |
+
"rstrip": false,
|
1504 |
+
"single_word": false,
|
1505 |
+
"special": true
|
1506 |
+
},
|
1507 |
+
"128188": {
|
1508 |
+
"content": "<|reserved_special_token_180|>",
|
1509 |
+
"lstrip": false,
|
1510 |
+
"normalized": false,
|
1511 |
+
"rstrip": false,
|
1512 |
+
"single_word": false,
|
1513 |
+
"special": true
|
1514 |
+
},
|
1515 |
+
"128189": {
|
1516 |
+
"content": "<|reserved_special_token_181|>",
|
1517 |
+
"lstrip": false,
|
1518 |
+
"normalized": false,
|
1519 |
+
"rstrip": false,
|
1520 |
+
"single_word": false,
|
1521 |
+
"special": true
|
1522 |
+
},
|
1523 |
+
"128190": {
|
1524 |
+
"content": "<|reserved_special_token_182|>",
|
1525 |
+
"lstrip": false,
|
1526 |
+
"normalized": false,
|
1527 |
+
"rstrip": false,
|
1528 |
+
"single_word": false,
|
1529 |
+
"special": true
|
1530 |
+
},
|
1531 |
+
"128191": {
|
1532 |
+
"content": "<|reserved_special_token_183|>",
|
1533 |
+
"lstrip": false,
|
1534 |
+
"normalized": false,
|
1535 |
+
"rstrip": false,
|
1536 |
+
"single_word": false,
|
1537 |
+
"special": true
|
1538 |
+
},
|
1539 |
+
"128192": {
|
1540 |
+
"content": "<|reserved_special_token_184|>",
|
1541 |
+
"lstrip": false,
|
1542 |
+
"normalized": false,
|
1543 |
+
"rstrip": false,
|
1544 |
+
"single_word": false,
|
1545 |
+
"special": true
|
1546 |
+
},
|
1547 |
+
"128193": {
|
1548 |
+
"content": "<|reserved_special_token_185|>",
|
1549 |
+
"lstrip": false,
|
1550 |
+
"normalized": false,
|
1551 |
+
"rstrip": false,
|
1552 |
+
"single_word": false,
|
1553 |
+
"special": true
|
1554 |
+
},
|
1555 |
+
"128194": {
|
1556 |
+
"content": "<|reserved_special_token_186|>",
|
1557 |
+
"lstrip": false,
|
1558 |
+
"normalized": false,
|
1559 |
+
"rstrip": false,
|
1560 |
+
"single_word": false,
|
1561 |
+
"special": true
|
1562 |
+
},
|
1563 |
+
"128195": {
|
1564 |
+
"content": "<|reserved_special_token_187|>",
|
1565 |
+
"lstrip": false,
|
1566 |
+
"normalized": false,
|
1567 |
+
"rstrip": false,
|
1568 |
+
"single_word": false,
|
1569 |
+
"special": true
|
1570 |
+
},
|
1571 |
+
"128196": {
|
1572 |
+
"content": "<|reserved_special_token_188|>",
|
1573 |
+
"lstrip": false,
|
1574 |
+
"normalized": false,
|
1575 |
+
"rstrip": false,
|
1576 |
+
"single_word": false,
|
1577 |
+
"special": true
|
1578 |
+
},
|
1579 |
+
"128197": {
|
1580 |
+
"content": "<|reserved_special_token_189|>",
|
1581 |
+
"lstrip": false,
|
1582 |
+
"normalized": false,
|
1583 |
+
"rstrip": false,
|
1584 |
+
"single_word": false,
|
1585 |
+
"special": true
|
1586 |
+
},
|
1587 |
+
"128198": {
|
1588 |
+
"content": "<|reserved_special_token_190|>",
|
1589 |
+
"lstrip": false,
|
1590 |
+
"normalized": false,
|
1591 |
+
"rstrip": false,
|
1592 |
+
"single_word": false,
|
1593 |
+
"special": true
|
1594 |
+
},
|
1595 |
+
"128199": {
|
1596 |
+
"content": "<|reserved_special_token_191|>",
|
1597 |
+
"lstrip": false,
|
1598 |
+
"normalized": false,
|
1599 |
+
"rstrip": false,
|
1600 |
+
"single_word": false,
|
1601 |
+
"special": true
|
1602 |
+
},
|
1603 |
+
"128200": {
|
1604 |
+
"content": "<|reserved_special_token_192|>",
|
1605 |
+
"lstrip": false,
|
1606 |
+
"normalized": false,
|
1607 |
+
"rstrip": false,
|
1608 |
+
"single_word": false,
|
1609 |
+
"special": true
|
1610 |
+
},
|
1611 |
+
"128201": {
|
1612 |
+
"content": "<|reserved_special_token_193|>",
|
1613 |
+
"lstrip": false,
|
1614 |
+
"normalized": false,
|
1615 |
+
"rstrip": false,
|
1616 |
+
"single_word": false,
|
1617 |
+
"special": true
|
1618 |
+
},
|
1619 |
+
"128202": {
|
1620 |
+
"content": "<|reserved_special_token_194|>",
|
1621 |
+
"lstrip": false,
|
1622 |
+
"normalized": false,
|
1623 |
+
"rstrip": false,
|
1624 |
+
"single_word": false,
|
1625 |
+
"special": true
|
1626 |
+
},
|
1627 |
+
"128203": {
|
1628 |
+
"content": "<|reserved_special_token_195|>",
|
1629 |
+
"lstrip": false,
|
1630 |
+
"normalized": false,
|
1631 |
+
"rstrip": false,
|
1632 |
+
"single_word": false,
|
1633 |
+
"special": true
|
1634 |
+
},
|
1635 |
+
"128204": {
|
1636 |
+
"content": "<|reserved_special_token_196|>",
|
1637 |
+
"lstrip": false,
|
1638 |
+
"normalized": false,
|
1639 |
+
"rstrip": false,
|
1640 |
+
"single_word": false,
|
1641 |
+
"special": true
|
1642 |
+
},
|
1643 |
+
"128205": {
|
1644 |
+
"content": "<|reserved_special_token_197|>",
|
1645 |
+
"lstrip": false,
|
1646 |
+
"normalized": false,
|
1647 |
+
"rstrip": false,
|
1648 |
+
"single_word": false,
|
1649 |
+
"special": true
|
1650 |
+
},
|
1651 |
+
"128206": {
|
1652 |
+
"content": "<|reserved_special_token_198|>",
|
1653 |
+
"lstrip": false,
|
1654 |
+
"normalized": false,
|
1655 |
+
"rstrip": false,
|
1656 |
+
"single_word": false,
|
1657 |
+
"special": true
|
1658 |
+
},
|
1659 |
+
"128207": {
|
1660 |
+
"content": "<|reserved_special_token_199|>",
|
1661 |
+
"lstrip": false,
|
1662 |
+
"normalized": false,
|
1663 |
+
"rstrip": false,
|
1664 |
+
"single_word": false,
|
1665 |
+
"special": true
|
1666 |
+
},
|
1667 |
+
"128208": {
|
1668 |
+
"content": "<|reserved_special_token_200|>",
|
1669 |
+
"lstrip": false,
|
1670 |
+
"normalized": false,
|
1671 |
+
"rstrip": false,
|
1672 |
+
"single_word": false,
|
1673 |
+
"special": true
|
1674 |
+
},
|
1675 |
+
"128209": {
|
1676 |
+
"content": "<|reserved_special_token_201|>",
|
1677 |
+
"lstrip": false,
|
1678 |
+
"normalized": false,
|
1679 |
+
"rstrip": false,
|
1680 |
+
"single_word": false,
|
1681 |
+
"special": true
|
1682 |
+
},
|
1683 |
+
"128210": {
|
1684 |
+
"content": "<|reserved_special_token_202|>",
|
1685 |
+
"lstrip": false,
|
1686 |
+
"normalized": false,
|
1687 |
+
"rstrip": false,
|
1688 |
+
"single_word": false,
|
1689 |
+
"special": true
|
1690 |
+
},
|
1691 |
+
"128211": {
|
1692 |
+
"content": "<|reserved_special_token_203|>",
|
1693 |
+
"lstrip": false,
|
1694 |
+
"normalized": false,
|
1695 |
+
"rstrip": false,
|
1696 |
+
"single_word": false,
|
1697 |
+
"special": true
|
1698 |
+
},
|
1699 |
+
"128212": {
|
1700 |
+
"content": "<|reserved_special_token_204|>",
|
1701 |
+
"lstrip": false,
|
1702 |
+
"normalized": false,
|
1703 |
+
"rstrip": false,
|
1704 |
+
"single_word": false,
|
1705 |
+
"special": true
|
1706 |
+
},
|
1707 |
+
"128213": {
|
1708 |
+
"content": "<|reserved_special_token_205|>",
|
1709 |
+
"lstrip": false,
|
1710 |
+
"normalized": false,
|
1711 |
+
"rstrip": false,
|
1712 |
+
"single_word": false,
|
1713 |
+
"special": true
|
1714 |
+
},
|
1715 |
+
"128214": {
|
1716 |
+
"content": "<|reserved_special_token_206|>",
|
1717 |
+
"lstrip": false,
|
1718 |
+
"normalized": false,
|
1719 |
+
"rstrip": false,
|
1720 |
+
"single_word": false,
|
1721 |
+
"special": true
|
1722 |
+
},
|
1723 |
+
"128215": {
|
1724 |
+
"content": "<|reserved_special_token_207|>",
|
1725 |
+
"lstrip": false,
|
1726 |
+
"normalized": false,
|
1727 |
+
"rstrip": false,
|
1728 |
+
"single_word": false,
|
1729 |
+
"special": true
|
1730 |
+
},
|
1731 |
+
"128216": {
|
1732 |
+
"content": "<|reserved_special_token_208|>",
|
1733 |
+
"lstrip": false,
|
1734 |
+
"normalized": false,
|
1735 |
+
"rstrip": false,
|
1736 |
+
"single_word": false,
|
1737 |
+
"special": true
|
1738 |
+
},
|
1739 |
+
"128217": {
|
1740 |
+
"content": "<|reserved_special_token_209|>",
|
1741 |
+
"lstrip": false,
|
1742 |
+
"normalized": false,
|
1743 |
+
"rstrip": false,
|
1744 |
+
"single_word": false,
|
1745 |
+
"special": true
|
1746 |
+
},
|
1747 |
+
"128218": {
|
1748 |
+
"content": "<|reserved_special_token_210|>",
|
1749 |
+
"lstrip": false,
|
1750 |
+
"normalized": false,
|
1751 |
+
"rstrip": false,
|
1752 |
+
"single_word": false,
|
1753 |
+
"special": true
|
1754 |
+
},
|
1755 |
+
"128219": {
|
1756 |
+
"content": "<|reserved_special_token_211|>",
|
1757 |
+
"lstrip": false,
|
1758 |
+
"normalized": false,
|
1759 |
+
"rstrip": false,
|
1760 |
+
"single_word": false,
|
1761 |
+
"special": true
|
1762 |
+
},
|
1763 |
+
"128220": {
|
1764 |
+
"content": "<|reserved_special_token_212|>",
|
1765 |
+
"lstrip": false,
|
1766 |
+
"normalized": false,
|
1767 |
+
"rstrip": false,
|
1768 |
+
"single_word": false,
|
1769 |
+
"special": true
|
1770 |
+
},
|
1771 |
+
"128221": {
|
1772 |
+
"content": "<|reserved_special_token_213|>",
|
1773 |
+
"lstrip": false,
|
1774 |
+
"normalized": false,
|
1775 |
+
"rstrip": false,
|
1776 |
+
"single_word": false,
|
1777 |
+
"special": true
|
1778 |
+
},
|
1779 |
+
"128222": {
|
1780 |
+
"content": "<|reserved_special_token_214|>",
|
1781 |
+
"lstrip": false,
|
1782 |
+
"normalized": false,
|
1783 |
+
"rstrip": false,
|
1784 |
+
"single_word": false,
|
1785 |
+
"special": true
|
1786 |
+
},
|
1787 |
+
"128223": {
|
1788 |
+
"content": "<|reserved_special_token_215|>",
|
1789 |
+
"lstrip": false,
|
1790 |
+
"normalized": false,
|
1791 |
+
"rstrip": false,
|
1792 |
+
"single_word": false,
|
1793 |
+
"special": true
|
1794 |
+
},
|
1795 |
+
"128224": {
|
1796 |
+
"content": "<|reserved_special_token_216|>",
|
1797 |
+
"lstrip": false,
|
1798 |
+
"normalized": false,
|
1799 |
+
"rstrip": false,
|
1800 |
+
"single_word": false,
|
1801 |
+
"special": true
|
1802 |
+
},
|
1803 |
+
"128225": {
|
1804 |
+
"content": "<|reserved_special_token_217|>",
|
1805 |
+
"lstrip": false,
|
1806 |
+
"normalized": false,
|
1807 |
+
"rstrip": false,
|
1808 |
+
"single_word": false,
|
1809 |
+
"special": true
|
1810 |
+
},
|
1811 |
+
"128226": {
|
1812 |
+
"content": "<|reserved_special_token_218|>",
|
1813 |
+
"lstrip": false,
|
1814 |
+
"normalized": false,
|
1815 |
+
"rstrip": false,
|
1816 |
+
"single_word": false,
|
1817 |
+
"special": true
|
1818 |
+
},
|
1819 |
+
"128227": {
|
1820 |
+
"content": "<|reserved_special_token_219|>",
|
1821 |
+
"lstrip": false,
|
1822 |
+
"normalized": false,
|
1823 |
+
"rstrip": false,
|
1824 |
+
"single_word": false,
|
1825 |
+
"special": true
|
1826 |
+
},
|
1827 |
+
"128228": {
|
1828 |
+
"content": "<|reserved_special_token_220|>",
|
1829 |
+
"lstrip": false,
|
1830 |
+
"normalized": false,
|
1831 |
+
"rstrip": false,
|
1832 |
+
"single_word": false,
|
1833 |
+
"special": true
|
1834 |
+
},
|
1835 |
+
"128229": {
|
1836 |
+
"content": "<|reserved_special_token_221|>",
|
1837 |
+
"lstrip": false,
|
1838 |
+
"normalized": false,
|
1839 |
+
"rstrip": false,
|
1840 |
+
"single_word": false,
|
1841 |
+
"special": true
|
1842 |
+
},
|
1843 |
+
"128230": {
|
1844 |
+
"content": "<|reserved_special_token_222|>",
|
1845 |
+
"lstrip": false,
|
1846 |
+
"normalized": false,
|
1847 |
+
"rstrip": false,
|
1848 |
+
"single_word": false,
|
1849 |
+
"special": true
|
1850 |
+
},
|
1851 |
+
"128231": {
|
1852 |
+
"content": "<|reserved_special_token_223|>",
|
1853 |
+
"lstrip": false,
|
1854 |
+
"normalized": false,
|
1855 |
+
"rstrip": false,
|
1856 |
+
"single_word": false,
|
1857 |
+
"special": true
|
1858 |
+
},
|
1859 |
+
"128232": {
|
1860 |
+
"content": "<|reserved_special_token_224|>",
|
1861 |
+
"lstrip": false,
|
1862 |
+
"normalized": false,
|
1863 |
+
"rstrip": false,
|
1864 |
+
"single_word": false,
|
1865 |
+
"special": true
|
1866 |
+
},
|
1867 |
+
"128233": {
|
1868 |
+
"content": "<|reserved_special_token_225|>",
|
1869 |
+
"lstrip": false,
|
1870 |
+
"normalized": false,
|
1871 |
+
"rstrip": false,
|
1872 |
+
"single_word": false,
|
1873 |
+
"special": true
|
1874 |
+
},
|
1875 |
+
"128234": {
|
1876 |
+
"content": "<|reserved_special_token_226|>",
|
1877 |
+
"lstrip": false,
|
1878 |
+
"normalized": false,
|
1879 |
+
"rstrip": false,
|
1880 |
+
"single_word": false,
|
1881 |
+
"special": true
|
1882 |
+
},
|
1883 |
+
"128235": {
|
1884 |
+
"content": "<|reserved_special_token_227|>",
|
1885 |
+
"lstrip": false,
|
1886 |
+
"normalized": false,
|
1887 |
+
"rstrip": false,
|
1888 |
+
"single_word": false,
|
1889 |
+
"special": true
|
1890 |
+
},
|
1891 |
+
"128236": {
|
1892 |
+
"content": "<|reserved_special_token_228|>",
|
1893 |
+
"lstrip": false,
|
1894 |
+
"normalized": false,
|
1895 |
+
"rstrip": false,
|
1896 |
+
"single_word": false,
|
1897 |
+
"special": true
|
1898 |
+
},
|
1899 |
+
"128237": {
|
1900 |
+
"content": "<|reserved_special_token_229|>",
|
1901 |
+
"lstrip": false,
|
1902 |
+
"normalized": false,
|
1903 |
+
"rstrip": false,
|
1904 |
+
"single_word": false,
|
1905 |
+
"special": true
|
1906 |
+
},
|
1907 |
+
"128238": {
|
1908 |
+
"content": "<|reserved_special_token_230|>",
|
1909 |
+
"lstrip": false,
|
1910 |
+
"normalized": false,
|
1911 |
+
"rstrip": false,
|
1912 |
+
"single_word": false,
|
1913 |
+
"special": true
|
1914 |
+
},
|
1915 |
+
"128239": {
|
1916 |
+
"content": "<|reserved_special_token_231|>",
|
1917 |
+
"lstrip": false,
|
1918 |
+
"normalized": false,
|
1919 |
+
"rstrip": false,
|
1920 |
+
"single_word": false,
|
1921 |
+
"special": true
|
1922 |
+
},
|
1923 |
+
"128240": {
|
1924 |
+
"content": "<|reserved_special_token_232|>",
|
1925 |
+
"lstrip": false,
|
1926 |
+
"normalized": false,
|
1927 |
+
"rstrip": false,
|
1928 |
+
"single_word": false,
|
1929 |
+
"special": true
|
1930 |
+
},
|
1931 |
+
"128241": {
|
1932 |
+
"content": "<|reserved_special_token_233|>",
|
1933 |
+
"lstrip": false,
|
1934 |
+
"normalized": false,
|
1935 |
+
"rstrip": false,
|
1936 |
+
"single_word": false,
|
1937 |
+
"special": true
|
1938 |
+
},
|
1939 |
+
"128242": {
|
1940 |
+
"content": "<|reserved_special_token_234|>",
|
1941 |
+
"lstrip": false,
|
1942 |
+
"normalized": false,
|
1943 |
+
"rstrip": false,
|
1944 |
+
"single_word": false,
|
1945 |
+
"special": true
|
1946 |
+
},
|
1947 |
+
"128243": {
|
1948 |
+
"content": "<|reserved_special_token_235|>",
|
1949 |
+
"lstrip": false,
|
1950 |
+
"normalized": false,
|
1951 |
+
"rstrip": false,
|
1952 |
+
"single_word": false,
|
1953 |
+
"special": true
|
1954 |
+
},
|
1955 |
+
"128244": {
|
1956 |
+
"content": "<|reserved_special_token_236|>",
|
1957 |
+
"lstrip": false,
|
1958 |
+
"normalized": false,
|
1959 |
+
"rstrip": false,
|
1960 |
+
"single_word": false,
|
1961 |
+
"special": true
|
1962 |
+
},
|
1963 |
+
"128245": {
|
1964 |
+
"content": "<|reserved_special_token_237|>",
|
1965 |
+
"lstrip": false,
|
1966 |
+
"normalized": false,
|
1967 |
+
"rstrip": false,
|
1968 |
+
"single_word": false,
|
1969 |
+
"special": true
|
1970 |
+
},
|
1971 |
+
"128246": {
|
1972 |
+
"content": "<|reserved_special_token_238|>",
|
1973 |
+
"lstrip": false,
|
1974 |
+
"normalized": false,
|
1975 |
+
"rstrip": false,
|
1976 |
+
"single_word": false,
|
1977 |
+
"special": true
|
1978 |
+
},
|
1979 |
+
"128247": {
|
1980 |
+
"content": "<|reserved_special_token_239|>",
|
1981 |
+
"lstrip": false,
|
1982 |
+
"normalized": false,
|
1983 |
+
"rstrip": false,
|
1984 |
+
"single_word": false,
|
1985 |
+
"special": true
|
1986 |
+
},
|
1987 |
+
"128248": {
|
1988 |
+
"content": "<|reserved_special_token_240|>",
|
1989 |
+
"lstrip": false,
|
1990 |
+
"normalized": false,
|
1991 |
+
"rstrip": false,
|
1992 |
+
"single_word": false,
|
1993 |
+
"special": true
|
1994 |
+
},
|
1995 |
+
"128249": {
|
1996 |
+
"content": "<|reserved_special_token_241|>",
|
1997 |
+
"lstrip": false,
|
1998 |
+
"normalized": false,
|
1999 |
+
"rstrip": false,
|
2000 |
+
"single_word": false,
|
2001 |
+
"special": true
|
2002 |
+
},
|
2003 |
+
"128250": {
|
2004 |
+
"content": "<|reserved_special_token_242|>",
|
2005 |
+
"lstrip": false,
|
2006 |
+
"normalized": false,
|
2007 |
+
"rstrip": false,
|
2008 |
+
"single_word": false,
|
2009 |
+
"special": true
|
2010 |
+
},
|
2011 |
+
"128251": {
|
2012 |
+
"content": "<|reserved_special_token_243|>",
|
2013 |
+
"lstrip": false,
|
2014 |
+
"normalized": false,
|
2015 |
+
"rstrip": false,
|
2016 |
+
"single_word": false,
|
2017 |
+
"special": true
|
2018 |
+
},
|
2019 |
+
"128252": {
|
2020 |
+
"content": "<|reserved_special_token_244|>",
|
2021 |
+
"lstrip": false,
|
2022 |
+
"normalized": false,
|
2023 |
+
"rstrip": false,
|
2024 |
+
"single_word": false,
|
2025 |
+
"special": true
|
2026 |
+
},
|
2027 |
+
"128253": {
|
2028 |
+
"content": "<|reserved_special_token_245|>",
|
2029 |
+
"lstrip": false,
|
2030 |
+
"normalized": false,
|
2031 |
+
"rstrip": false,
|
2032 |
+
"single_word": false,
|
2033 |
+
"special": true
|
2034 |
+
},
|
2035 |
+
"128254": {
|
2036 |
+
"content": "<|reserved_special_token_246|>",
|
2037 |
+
"lstrip": false,
|
2038 |
+
"normalized": false,
|
2039 |
+
"rstrip": false,
|
2040 |
+
"single_word": false,
|
2041 |
+
"special": true
|
2042 |
+
},
|
2043 |
+
"128255": {
|
2044 |
+
"content": "<|reserved_special_token_247|>",
|
2045 |
+
"lstrip": false,
|
2046 |
+
"normalized": false,
|
2047 |
+
"rstrip": false,
|
2048 |
+
"single_word": false,
|
2049 |
+
"special": true
|
2050 |
+
}
|
2051 |
+
},
|
2052 |
+
"bos_token": "<|begin_of_text|>",
|
2053 |
+
"chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", 
\" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n",
|
2054 |
+
"clean_up_tokenization_spaces": true,
|
2055 |
+
"eos_token": "<|eot_id|>",
|
2056 |
+
"model_input_names": [
|
2057 |
+
"input_ids",
|
2058 |
+
"attention_mask"
|
2059 |
+
],
|
2060 |
+
"model_max_length": 131072,
|
2061 |
+
"pad_token": "<|finetune_right_pad_id|>",
|
2062 |
+
"padding_side": "right",
|
2063 |
+
"tokenizer_class": "PreTrainedTokenizerFast"
|
2064 |
+
}
|
crawl/crawl
CHANGED
@@ -124,9 +124,9 @@ def save_result(target_url):
|
|
124 |
|
125 |
# Choose the appropriate base path based on the operating system
|
126 |
if platform.system() == "Windows":
|
127 |
-
base_path = "E:\\datasets\\
|
128 |
-
|
129 |
-
base_path = "/home/kade/datasets/
|
130 |
|
131 |
save_dir = os.path.join(base_path, sanitized_title)
|
132 |
os.makedirs(save_dir, exist_ok=True)
|
|
|
124 |
|
125 |
# Choose the appropriate base path based on the operating system
|
126 |
if platform.system() == "Windows":
|
127 |
+
base_path = "E:\\datasets\\ragpile\\Saved Websites\\"
|
128 |
+
else:
|
129 |
+
base_path = "/home/kade/datasets/ragpile/Saved Websites"
|
130 |
|
131 |
save_dir = os.path.join(base_path, sanitized_title)
|
132 |
os.makedirs(save_dir, exist_ok=True)
|
crawl/crawl_wikipedia
CHANGED
@@ -126,9 +126,9 @@ def save_result(target_url):
|
|
126 |
|
127 |
# Choose the appropriate base path based on the operating system
|
128 |
if platform.system() == "Windows":
|
129 |
-
base_path = "E:\\
|
130 |
else:
|
131 |
-
base_path = "/home/kade/
|
132 |
|
133 |
save_dir = os.path.join(base_path, sanitized_title)
|
134 |
os.makedirs(save_dir, exist_ok=True)
|
|
|
126 |
|
127 |
# Choose the appropriate base path based on the operating system
|
128 |
if platform.system() == "Windows":
|
129 |
+
base_path = "E:\\ragpile\\Saved Websites\\"
|
130 |
else:
|
131 |
+
base_path = "/home/kade/datasets/ragpile/Saved Websites/"
|
132 |
|
133 |
save_dir = os.path.join(base_path, sanitized_title)
|
134 |
os.makedirs(save_dir, exist_ok=True)
|
joy
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
# -*- coding: utf-8 -*-
|
3 |
|
4 |
"""
|
5 |
-
JoyCaption Alpha
|
6 |
|
7 |
This module provides functionality for generating captions for images using a
|
8 |
combination of CLIP, LLM, and custom image adapters. It supports various
|
@@ -18,6 +18,7 @@ import os
|
|
18 |
import argparse
|
19 |
import re
|
20 |
import random
|
|
|
21 |
from pathlib import Path
|
22 |
from typing import List, Tuple, Dict
|
23 |
from PIL import Image
|
@@ -33,49 +34,56 @@ from transformers import (
|
|
33 |
)
|
34 |
from torch import nn
|
35 |
from e6db_reader import TagSetNormalizer, tag_category2id, tag_rank_to_freq
|
|
|
36 |
|
37 |
CLIP_PATH = "google/siglip-so400m-patch14-384"
|
38 |
MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
|
39 |
-
CHECKPOINT_PATH = Path(__file__).resolve().parent / "
|
40 |
CAPTION_TYPE_MAP = {
|
41 |
-
|
42 |
-
"Write a descriptive caption for this image in a formal tone."
|
|
|
|
|
43 |
],
|
44 |
-
|
45 |
-
"Write a descriptive caption for this image in a
|
46 |
-
"{word_count} words."
|
|
|
47 |
],
|
48 |
-
|
49 |
-
"Write a
|
|
|
|
|
50 |
],
|
51 |
-
|
52 |
-
"Write a
|
|
|
|
|
53 |
],
|
54 |
-
|
55 |
-
"Write a
|
56 |
-
"{word_count} words."
|
|
|
57 |
],
|
58 |
-
|
59 |
-
"Write a
|
|
|
|
|
60 |
],
|
61 |
-
|
62 |
-
"
|
|
|
|
|
63 |
],
|
64 |
-
|
65 |
-
"Write a
|
66 |
-
"{word_count} words."
|
|
|
67 |
],
|
68 |
-
|
69 |
-
"Write a
|
70 |
-
|
71 |
-
|
72 |
-
"Write a list of Booru tags for this image."
|
73 |
-
],
|
74 |
-
("rng-tags", "formal", False, True): [
|
75 |
-
"Write a list of Booru tags for this image within {word_count} words."
|
76 |
-
],
|
77 |
-
("rng-tags", "formal", True, False): [
|
78 |
-
"Write a {length} list of Booru tags for this image."
|
79 |
],
|
80 |
}
|
81 |
|
@@ -176,8 +184,9 @@ class ImageAdapter(nn.Module):
|
|
176 |
x = self.linear2(x)
|
177 |
|
178 |
other_tokens = self.other_tokens(
|
179 |
-
torch.tensor([0, 1], device=self.other_tokens.weight.device)
|
180 |
-
|
|
|
181 |
)
|
182 |
assert other_tokens.shape == (
|
183 |
x.shape[0],
|
@@ -200,6 +209,13 @@ class ImageAdapter(nn.Module):
|
|
200 |
).squeeze(0)
|
201 |
|
202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
class JoyCaptionModel:
|
204 |
"""
|
205 |
A class for generating captions for images using CLIP, LLM,
|
@@ -219,7 +235,7 @@ class JoyCaptionModel:
|
|
219 |
|
220 |
Methods:
|
221 |
load_models(): Load and initialize all required models.
|
222 |
-
process_image(input_image, caption_type,
|
223 |
Process an input image and generate a caption
|
224 |
based on specified parameters.
|
225 |
"""
|
@@ -234,18 +250,17 @@ class JoyCaptionModel:
|
|
234 |
"""
|
235 |
Load and initialize all required models (CLIP, LLM, image adapter).
|
236 |
"""
|
237 |
-
|
238 |
self.clip_model = AutoModel.from_pretrained(CLIP_PATH)
|
239 |
self.clip_model = self.clip_model.vision_model
|
240 |
|
241 |
if (CHECKPOINT_PATH / "clip_model.pt").exists():
|
242 |
-
|
243 |
checkpoint = torch.load(
|
244 |
CHECKPOINT_PATH / "clip_model.pt", map_location="cpu"
|
245 |
)
|
246 |
checkpoint = {
|
247 |
-
k.replace("_orig_mod.module.", ""): v
|
248 |
-
for k, v in checkpoint.items()
|
249 |
}
|
250 |
self.clip_model.load_state_dict(checkpoint)
|
251 |
del checkpoint
|
@@ -254,21 +269,19 @@ class JoyCaptionModel:
|
|
254 |
self.clip_model.requires_grad_(False)
|
255 |
self.clip_model.to("cuda")
|
256 |
|
257 |
-
|
258 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
259 |
-
|
260 |
)
|
261 |
assert isinstance(
|
262 |
self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
|
263 |
)
|
264 |
|
265 |
-
|
266 |
if (CHECKPOINT_PATH / "text_model").exists():
|
267 |
-
|
268 |
self.text_model = AutoModelForCausalLM.from_pretrained(
|
269 |
-
CHECKPOINT_PATH / "text_model",
|
270 |
-
device_map=0,
|
271 |
-
torch_dtype=torch.bfloat16
|
272 |
)
|
273 |
else:
|
274 |
self.text_model = AutoModelForCausalLM.from_pretrained(
|
@@ -277,7 +290,7 @@ class JoyCaptionModel:
|
|
277 |
|
278 |
self.text_model.eval()
|
279 |
|
280 |
-
|
281 |
self.image_adapter = ImageAdapter(
|
282 |
self.clip_model.config.hidden_size,
|
283 |
self.text_model.config.hidden_size,
|
@@ -287,10 +300,7 @@ class JoyCaptionModel:
|
|
287 |
False,
|
288 |
)
|
289 |
self.image_adapter.load_state_dict(
|
290 |
-
torch.load(
|
291 |
-
CHECKPOINT_PATH / "image_adapter.pt",
|
292 |
-
map_location="cpu"
|
293 |
-
)
|
294 |
)
|
295 |
self.image_adapter.eval()
|
296 |
self.image_adapter.to("cuda")
|
@@ -299,72 +309,120 @@ class JoyCaptionModel:
|
|
299 |
def process_image(
|
300 |
self,
|
301 |
input_image: Image.Image,
|
302 |
-
|
303 |
-
|
304 |
-
caption_length: str | int,
|
305 |
-
custom_prompt: str | None = None,
|
306 |
-
) -> str:
|
307 |
"""
|
308 |
-
Process an input image and generate a caption based on specified
|
309 |
-
|
|
|
|
|
|
|
310 |
"""
|
311 |
torch.cuda.empty_cache()
|
312 |
|
313 |
-
if custom_prompt is not None:
|
314 |
-
prompt_str = custom_prompt
|
315 |
-
else:
|
316 |
-
prompt_str = self._get_prompt_string(
|
317 |
-
caption_type, caption_tone, caption_length
|
318 |
-
)
|
319 |
-
print(f"Prompt: {prompt_str}")
|
320 |
-
|
321 |
pixel_values = self._preprocess_image(input_image)
|
322 |
-
prompt = self._tokenize_prompt(prompt_str)
|
323 |
|
324 |
embedded_images = self._embed_image(pixel_values)
|
325 |
inputs_embeds, input_ids, attention_mask = self._construct_inputs(
|
326 |
-
embedded_images,
|
327 |
)
|
328 |
|
329 |
-
generate_ids = self._generate_caption(inputs_embeds,
|
330 |
-
input_ids,
|
331 |
-
attention_mask)
|
332 |
caption = self._decode_caption(generate_ids, input_ids)
|
333 |
|
334 |
-
|
|
|
|
|
|
|
|
|
335 |
|
336 |
def generate_valid_caption(
|
337 |
self,
|
338 |
input_image: Image.Image,
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
|
|
|
|
|
|
343 |
) -> str:
|
344 |
"""
|
345 |
-
Generate a valid caption, retrying if
|
346 |
-
|
347 |
-
|
348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
349 |
"""
|
350 |
while True:
|
351 |
-
caption = self.process_image(
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
return caption
|
365 |
-
print("Generated caption is invalid. Retrying...")
|
366 |
|
367 |
-
|
|
|
368 |
length = None if caption_length == "any" else caption_length
|
369 |
|
370 |
if isinstance(length, str):
|
@@ -373,103 +431,128 @@ class JoyCaptionModel:
|
|
373 |
except ValueError:
|
374 |
pass
|
375 |
|
376 |
-
|
377 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
378 |
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
isinstance(length, str),
|
383 |
-
isinstance(length, int),
|
384 |
-
)
|
385 |
-
if prompt_key not in CAPTION_TYPE_MAP:
|
386 |
-
raise ValueError(f"Invalid caption type: {prompt_key}")
|
387 |
|
388 |
-
prompt_str = CAPTION_TYPE_MAP[
|
389 |
-
|
390 |
-
)
|
391 |
return prompt_str
|
392 |
|
393 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
394 |
image = input_image.resize((384, 384), Image.LANCZOS)
|
395 |
pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
|
396 |
pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
|
397 |
-
|
398 |
-
return pixel_values
|
399 |
|
400 |
-
def
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
add_special_tokens=False,
|
407 |
-
)
|
408 |
-
return prompt
|
409 |
|
410 |
-
|
|
|
|
|
411 |
with torch.amp.autocast_mode.autocast("cuda", enabled=True):
|
412 |
vision_outputs = self.clip_model(
|
413 |
pixel_values=pixel_values, output_hidden_states=True
|
414 |
)
|
415 |
-
|
416 |
-
|
417 |
-
embedded_images = embedded_images.to("cuda")
|
418 |
-
return embedded_images
|
419 |
-
|
420 |
-
def _construct_inputs(self, embedded_images, prompt):
|
421 |
-
prompt_embeds = self.text_model.model.embed_tokens(prompt.to("cuda"))
|
422 |
-
assert prompt_embeds.shape == (
|
423 |
-
1,
|
424 |
-
prompt.shape[1],
|
425 |
-
self.text_model.config.hidden_size,
|
426 |
-
), (
|
427 |
-
f"Prompt shape is {prompt_embeds.shape}, expected "
|
428 |
-
f"{(1, prompt.shape[1], self.text_model.config.hidden_size)}"
|
429 |
-
)
|
430 |
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
437 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
438 |
|
439 |
-
|
440 |
-
|
441 |
-
.unsqueeze(0)
|
442 |
-
.to(dtype=self.text_model.dtype)
|
443 |
)
|
444 |
|
445 |
-
|
|
|
446 |
[
|
447 |
-
|
448 |
-
embedded_images.to(dtype=
|
449 |
-
|
450 |
-
eot_embed.expand(embedded_images.shape[0], -1, -1),
|
451 |
],
|
452 |
dim=1,
|
453 |
-
)
|
454 |
|
455 |
input_ids = torch.cat(
|
456 |
[
|
457 |
-
|
458 |
-
|
459 |
-
),
|
460 |
-
torch.zeros(
|
461 |
-
(1, embedded_images.shape[1]), dtype=torch.long
|
462 |
-
),
|
463 |
-
prompt,
|
464 |
-
torch.tensor(
|
465 |
-
[[self.tokenizer.eos_token_id]], dtype=torch.long
|
466 |
-
),
|
467 |
],
|
468 |
dim=1,
|
469 |
).to("cuda")
|
|
|
470 |
attention_mask = torch.ones_like(input_ids)
|
471 |
|
472 |
-
return
|
473 |
|
474 |
def _generate_caption(self, inputs_embeds, input_ids, attention_mask):
|
475 |
generate_ids = self.text_model.generate(
|
@@ -477,6 +560,7 @@ class JoyCaptionModel:
|
|
477 |
inputs_embeds=inputs_embeds,
|
478 |
attention_mask=attention_mask,
|
479 |
max_new_tokens=300,
|
|
|
480 |
do_sample=True,
|
481 |
suppress_tokens=None,
|
482 |
repetition_penalty=1.2,
|
@@ -484,20 +568,73 @@ class JoyCaptionModel:
|
|
484 |
return generate_ids
|
485 |
|
486 |
def _decode_caption(self, generate_ids, input_ids):
|
487 |
-
generate_ids = generate_ids[:, input_ids.shape[1]:]
|
488 |
|
489 |
-
if
|
490 |
-
|
491 |
-
|
492 |
generate_ids = generate_ids[:, :-1]
|
493 |
|
494 |
caption = self.tokenizer.batch_decode(
|
495 |
-
generate_ids,
|
496 |
-
skip_special_tokens=False,
|
497 |
-
clean_up_tokenization_spaces=False
|
498 |
)[0]
|
499 |
return caption
|
500 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
501 |
|
502 |
def main():
|
503 |
"""
|
@@ -517,36 +654,21 @@ def main():
|
|
517 |
"--caption_type",
|
518 |
type=str,
|
519 |
default="descriptive",
|
520 |
-
choices=
|
521 |
help="Type of caption to generate.",
|
522 |
)
|
523 |
parser.add_argument(
|
524 |
-
"--
|
525 |
-
type=str,
|
526 |
-
default="formal",
|
527 |
-
choices=["formal", "informal"],
|
528 |
-
help="Tone of the caption.",
|
529 |
-
)
|
530 |
-
parser.add_argument(
|
531 |
-
"--caption_length",
|
532 |
-
type=str,
|
533 |
-
default="any",
|
534 |
-
help="Length of the caption."
|
535 |
)
|
536 |
parser.add_argument(
|
537 |
"--dont-strip-commas",
|
538 |
action="store_true",
|
539 |
-
help=(
|
540 |
-
"If set, commas will not be stripped from the generated captions."
|
541 |
-
),
|
542 |
)
|
543 |
parser.add_argument(
|
544 |
"--custom_prompt",
|
545 |
type=str,
|
546 |
-
help=(
|
547 |
-
"Custom prompt for the captioner. "
|
548 |
-
"Use with --caption_type custom."
|
549 |
-
),
|
550 |
)
|
551 |
parser.add_argument(
|
552 |
"--add-commas-to-sentence-ends",
|
@@ -564,6 +686,11 @@ def main():
|
|
564 |
"Optionally specify the number of tags to use."
|
565 |
),
|
566 |
)
|
|
|
|
|
|
|
|
|
|
|
567 |
parser.add_argument(
|
568 |
"--random-tags",
|
569 |
type=int,
|
@@ -572,81 +699,105 @@ def main():
|
|
572 |
"Only works if --feed-from-tags is enabled."
|
573 |
),
|
574 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
575 |
|
576 |
args = parser.parse_args()
|
577 |
|
578 |
-
|
579 |
-
if args.random_tags is not None and args.feed_from_tags is None:
|
580 |
-
parser.error(
|
581 |
-
"--random-tags can only be used when --feed-from-tags is enabled"
|
582 |
-
)
|
583 |
-
|
584 |
-
print("Loading e621 tag data")
|
585 |
-
tagset_normalizer = make_tagset_normalizer()
|
586 |
-
|
587 |
-
# Initialize and load models
|
588 |
-
joy_caption_model = JoyCaptionModel()
|
589 |
-
joy_caption_model.load_models()
|
590 |
-
|
591 |
-
# Validate custom prompt usage
|
592 |
-
if args.caption_type == "custom" and not args.custom_prompt:
|
593 |
-
parser.error(
|
594 |
-
"--custom_prompt is required when using --caption_type custom"
|
595 |
-
)
|
596 |
-
elif args.caption_type != "custom" and args.custom_prompt:
|
597 |
-
parser.error(
|
598 |
-
"--custom_prompt can only be used with --caption_type custom"
|
599 |
-
)
|
600 |
|
|
|
601 |
image_extensions = {".webp", ".png", ".jpeg", ".jpg", ".jxl"}
|
602 |
for image_path in Path(args.directory).rglob("*"):
|
603 |
if image_path.suffix.lower() in image_extensions:
|
604 |
caption_file = image_path.with_suffix(".caption")
|
605 |
-
|
606 |
# Skip if the caption file already exists
|
607 |
if caption_file.exists():
|
608 |
-
|
609 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
610 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
611 |
input_image = Image.open(image_path).convert("RGB")
|
612 |
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
elif args.feed_from_tags is not None:
|
618 |
-
custom_prompt = prompt_from_tags(
|
619 |
-
args, image_path, tagset_normalizer
|
620 |
-
)
|
621 |
|
622 |
-
|
|
|
|
|
|
|
623 |
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
args.caption_tone,
|
628 |
-
args.caption_length,
|
629 |
-
custom_prompt=custom_prompt,
|
630 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
631 |
|
632 |
-
|
633 |
-
|
634 |
-
|
635 |
-
|
636 |
|
637 |
-
|
638 |
-
|
639 |
-
|
640 |
|
641 |
-
|
642 |
-
|
643 |
|
644 |
-
|
645 |
|
646 |
-
|
647 |
-
|
648 |
-
|
649 |
-
|
650 |
|
651 |
|
652 |
RE_PARENS_SUFFIX = re.compile(r"_\([^)]+\)$")
|
@@ -723,11 +874,16 @@ TAG_CHARACTER = tag_category2id["character"]
|
|
723 |
TAG_ARTIST = tag_category2id["artist"]
|
724 |
TAG_COPYRIGHT = tag_category2id["copyright"]
|
725 |
TAG_META = tag_category2id["meta"]
|
726 |
-
TAG_FREQ_THRESH = 0
|
727 |
|
728 |
|
729 |
-
def prompt_from_tags(
|
730 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
731 |
"""
|
732 |
Generates a prompt from tags associated with the given image.
|
733 |
|
@@ -737,32 +893,35 @@ def prompt_from_tags(args, image_path: Path,
|
|
737 |
The path to the image file.
|
738 |
tagset_normalizer (TagSetNormalizer):
|
739 |
An instance to normalize the tag set.
|
740 |
-
|
741 |
-
Returns:
|
742 |
-
None
|
743 |
"""
|
|
|
744 |
tag_file = find_tag_file(image_path)
|
745 |
if tag_file is None:
|
746 |
-
|
|
|
747 |
|
748 |
with open(tag_file, "r", encoding="utf-8") as f:
|
749 |
tags = f.read().lower().split(",")
|
750 |
|
|
|
751 |
tag_id_to_cat_id = tagset_normalizer.tag_normalizer.tag_categories
|
752 |
encode = tagset_normalizer.tag_normalizer.encode
|
753 |
|
754 |
-
#
|
|
|
755 |
tag_by_category: Dict[int, List[Tuple[int, str, int]]] = {
|
756 |
-
cat: []
|
757 |
-
for cat in [TAG_ARTIST, TAG_CHARACTER, TAG_COPYRIGHT, TAG_SPECIES]
|
758 |
}
|
759 |
other_tags: List[Tuple[int, str, int]] = []
|
760 |
implied: set = set()
|
|
|
|
|
761 |
for tag in tags:
|
762 |
tag = tag.strip()
|
763 |
# Encode the tag into a numerical id
|
764 |
tag_id = encode(tag.replace(" ", "_"))
|
765 |
if tag_id is None:
|
|
|
766 |
other_tags.append((0, tag, 0))
|
767 |
implied.update(tagset_normalizer.implications_rej.get(0, ()))
|
768 |
continue
|
@@ -771,28 +930,29 @@ def prompt_from_tags(args, image_path: Path,
|
|
771 |
# Skip meta tags
|
772 |
if cat_id == TAG_META:
|
773 |
continue
|
|
|
774 |
implied.update(tagset_normalizer.implications.get(tag_id, ()))
|
775 |
# Get the frequency of the tag
|
776 |
freq = tag_rank_to_freq(tag_id)
|
777 |
-
if freq <
|
778 |
continue
|
779 |
-
|
780 |
-
|
781 |
-
)
|
782 |
|
|
|
783 |
other_tags = sorted(
|
784 |
-
(
|
785 |
for freq, tag, tag_id in other_tags
|
786 |
if tag_id not in implied
|
787 |
)
|
788 |
|
|
|
789 |
for cat_id, cat_list in tag_by_category.items():
|
790 |
tag_by_category[cat_id] = sorted(
|
791 |
-
(
|
792 |
-
for freq, tag, tag_id in cat_list
|
793 |
-
if tag_id not in implied
|
794 |
)
|
795 |
|
|
|
796 |
if args.random_tags is not None:
|
797 |
# Randomly select tags if --random-tags is specified
|
798 |
num_tags = min(args.random_tags, len(other_tags))
|
@@ -807,11 +967,10 @@ def prompt_from_tags(args, image_path: Path,
|
|
807 |
# Use specified number of tags if --feed-from-tags has a positive value
|
808 |
other_tags = other_tags[: args.feed_from_tags]
|
809 |
|
810 |
-
# Prepare sentence pieces
|
811 |
artist_tag = tag_by_category[TAG_ARTIST]
|
812 |
if artist_tag:
|
813 |
-
artist_list = [str(tp[1]).removeprefix(
|
814 |
-
for tp in artist_tag[:4]]
|
815 |
artist_txt = f"by {format_nl_list(artist_list)}"
|
816 |
else:
|
817 |
artist_txt = ""
|
@@ -826,15 +985,13 @@ def prompt_from_tags(args, image_path: Path,
|
|
826 |
species_tag = tag_by_category[TAG_SPECIES]
|
827 |
if species_tag:
|
828 |
species_txt = (
|
829 |
-
"of a "
|
830 |
-
if len(character_tag) <= 1 and len(species_tag) <= 1
|
831 |
-
else "of "
|
832 |
)
|
833 |
species_txt += format_nl_list([tp[1] for tp in species_tag[:4]])
|
834 |
else:
|
835 |
if character_tag:
|
836 |
species_txt = (
|
837 |
-
" a character" if len(character_tag) <= 1 else " characters"
|
838 |
)
|
839 |
else:
|
840 |
species_txt = ""
|
@@ -845,13 +1002,32 @@ def prompt_from_tags(args, image_path: Path,
|
|
845 |
copyright_txt = f"from {format_nl_list(tags)}"
|
846 |
else:
|
847 |
copyright_txt = ""
|
|
|
|
|
848 |
tag_string = ", ".join(tp[1] for tp in other_tags)
|
849 |
-
|
850 |
-
|
851 |
-
|
852 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
853 |
tag_string,
|
854 |
-
|
|
|
|
|
|
|
|
|
855 |
return custom_prompt
|
856 |
|
857 |
|
@@ -877,5 +1053,12 @@ def find_tag_file(image_path):
|
|
877 |
return None
|
878 |
|
879 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
880 |
if __name__ == "__main__":
|
881 |
main()
|
|
|
2 |
# -*- coding: utf-8 -*-
|
3 |
|
4 |
"""
|
5 |
+
JoyCaption Alpha Two
|
6 |
|
7 |
This module provides functionality for generating captions for images using a
|
8 |
combination of CLIP, LLM, and custom image adapters. It supports various
|
|
|
18 |
import argparse
|
19 |
import re
|
20 |
import random
|
21 |
+
import math
|
22 |
from pathlib import Path
|
23 |
from typing import List, Tuple, Dict
|
24 |
from PIL import Image
|
|
|
34 |
)
|
35 |
from torch import nn
|
36 |
from e6db_reader import TagSetNormalizer, tag_category2id, tag_rank_to_freq
|
37 |
+
import logging
|
38 |
|
39 |
CLIP_PATH = "google/siglip-so400m-patch14-384"
|
40 |
MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
|
41 |
+
CHECKPOINT_PATH = Path(__file__).resolve().parent / "cgrkzexw-599808"
|
42 |
CAPTION_TYPE_MAP = {
|
43 |
+
"descriptive": [
|
44 |
+
"Write a descriptive caption for this image in a formal tone.",
|
45 |
+
"Write a descriptive caption for this image in a formal tone within {word_count} words.",
|
46 |
+
"Write a {length} descriptive caption for this image in a formal tone.",
|
47 |
],
|
48 |
+
"descriptive (informal)": [
|
49 |
+
"Write a descriptive caption for this image in a casual tone.",
|
50 |
+
"Write a descriptive caption for this image in a casual tone within {word_count} words.",
|
51 |
+
"Write a {length} descriptive caption for this image in a casual tone.",
|
52 |
],
|
53 |
+
"training prompt": [
|
54 |
+
"Write a stable diffusion prompt for this image.",
|
55 |
+
"Write a stable diffusion prompt for this image within {word_count} words.",
|
56 |
+
"Write a {length} stable diffusion prompt for this image.",
|
57 |
],
|
58 |
+
"midjourney": [
|
59 |
+
"Write a MidJourney prompt for this image.",
|
60 |
+
"Write a MidJourney prompt for this image within {word_count} words.",
|
61 |
+
"Write a {length} MidJourney prompt for this image.",
|
62 |
],
|
63 |
+
"booru tag list": [
|
64 |
+
"Write a list of Booru tags for this image.",
|
65 |
+
"Write a list of Booru tags for this image within {word_count} words.",
|
66 |
+
"Write a {length} list of Booru tags for this image.",
|
67 |
],
|
68 |
+
"booru-like tag list": [
|
69 |
+
"Write a list of Booru-like tags for this image.",
|
70 |
+
"Write a list of Booru-like tags for this image within {word_count} words.",
|
71 |
+
"Write a {length} list of Booru-like tags for this image.",
|
72 |
],
|
73 |
+
"art critic": [
|
74 |
+
"Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.",
|
75 |
+
"Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it within {word_count} words.",
|
76 |
+
"Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it {length}.",
|
77 |
],
|
78 |
+
"product listing": [
|
79 |
+
"Write a caption for this image as though it were a product listing.",
|
80 |
+
"Write a caption for this image as though it were a product listing. Keep it under {word_count} words.",
|
81 |
+
"Write a {length} caption for this image as though it were a product listing.",
|
82 |
],
|
83 |
+
"social media post": [
|
84 |
+
"Write a caption for this image as if it were being used for a social media post.",
|
85 |
+
"Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.",
|
86 |
+
"Write a {length} caption for this image as if it were being used for a social media post.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
],
|
88 |
}
|
89 |
|
|
|
184 |
x = self.linear2(x)
|
185 |
|
186 |
other_tokens = self.other_tokens(
|
187 |
+
torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(
|
188 |
+
x.shape[0], -1
|
189 |
+
)
|
190 |
)
|
191 |
assert other_tokens.shape == (
|
192 |
x.shape[0],
|
|
|
209 |
).squeeze(0)
|
210 |
|
211 |
|
212 |
+
STOP_WORDS: set[str] = set(
|
213 |
+
"i'll if we'd can't you'd shouldn't i'd only doesn't further isn't didn't has more aren't during do than were he's too here you against could few for ought won't we until weren't i've they're same up she but are how here's their over can under mustn't while on by had and an each he'd he about she'd am was she'll where's did out or that's it they'd a let's shall what's the to don't when below no any some from is hadn't all they i'm must in before who's own where you've that very them this not because it's shan't wasn't you'll when's most off i at other hasn't nor been such again we'll down above will so should into she's once have these why's be we've as being why those then with after may you're would haven't both wouldn't there cannot they've couldn't how's between does we're through he'll of there's they'll might".split(
|
214 |
+
" "
|
215 |
+
)
|
216 |
+
)
|
217 |
+
|
218 |
+
|
219 |
class JoyCaptionModel:
|
220 |
"""
|
221 |
A class for generating captions for images using CLIP, LLM,
|
|
|
235 |
|
236 |
Methods:
|
237 |
load_models(): Load and initialize all required models.
|
238 |
+
process_image(input_image, caption_type, caption_length):
|
239 |
Process an input image and generate a caption
|
240 |
based on specified parameters.
|
241 |
"""
|
|
|
250 |
"""
|
251 |
Load and initialize all required models (CLIP, LLM, image adapter).
|
252 |
"""
|
253 |
+
logging.info("Loading CLIP")
|
254 |
self.clip_model = AutoModel.from_pretrained(CLIP_PATH)
|
255 |
self.clip_model = self.clip_model.vision_model
|
256 |
|
257 |
if (CHECKPOINT_PATH / "clip_model.pt").exists():
|
258 |
+
logging.info("Loading VLM's custom vision model")
|
259 |
checkpoint = torch.load(
|
260 |
CHECKPOINT_PATH / "clip_model.pt", map_location="cpu"
|
261 |
)
|
262 |
checkpoint = {
|
263 |
+
k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()
|
|
|
264 |
}
|
265 |
self.clip_model.load_state_dict(checkpoint)
|
266 |
del checkpoint
|
|
|
269 |
self.clip_model.requires_grad_(False)
|
270 |
self.clip_model.to("cuda")
|
271 |
|
272 |
+
logging.info("Loading tokenizer")
|
273 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
274 |
+
CHECKPOINT_PATH / "text_model", use_fast=True
|
275 |
)
|
276 |
assert isinstance(
|
277 |
self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
|
278 |
)
|
279 |
|
280 |
+
logging.info("Loading LLM")
|
281 |
if (CHECKPOINT_PATH / "text_model").exists():
|
282 |
+
logging.info("Loading VLM's custom text model")
|
283 |
self.text_model = AutoModelForCausalLM.from_pretrained(
|
284 |
+
CHECKPOINT_PATH / "text_model", device_map=0, torch_dtype=torch.bfloat16
|
|
|
|
|
285 |
)
|
286 |
else:
|
287 |
self.text_model = AutoModelForCausalLM.from_pretrained(
|
|
|
290 |
|
291 |
self.text_model.eval()
|
292 |
|
293 |
+
logging.info("Loading image adapter")
|
294 |
self.image_adapter = ImageAdapter(
|
295 |
self.clip_model.config.hidden_size,
|
296 |
self.text_model.config.hidden_size,
|
|
|
300 |
False,
|
301 |
)
|
302 |
self.image_adapter.load_state_dict(
|
303 |
+
torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu")
|
|
|
|
|
|
|
304 |
)
|
305 |
self.image_adapter.eval()
|
306 |
self.image_adapter.to("cuda")
|
|
|
309 |
def process_image(
|
310 |
self,
|
311 |
input_image: Image.Image,
|
312 |
+
prompt_str: str,
|
313 |
+
) -> Tuple[str, float]:
|
|
|
|
|
|
|
314 |
"""
|
315 |
+
Process an input image and generate a caption based on specified parameters.
|
316 |
+
Also calculates the entropy of the generated caption.
|
317 |
+
|
318 |
+
Returns:
|
319 |
+
Tuple[str, float]: The generated caption and its entropy.
|
320 |
"""
|
321 |
torch.cuda.empty_cache()
|
322 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
pixel_values = self._preprocess_image(input_image)
|
|
|
324 |
|
325 |
embedded_images = self._embed_image(pixel_values)
|
326 |
inputs_embeds, input_ids, attention_mask = self._construct_inputs(
|
327 |
+
embedded_images, prompt_str
|
328 |
)
|
329 |
|
330 |
+
generate_ids = self._generate_caption(inputs_embeds, input_ids, attention_mask)
|
|
|
|
|
331 |
caption = self._decode_caption(generate_ids, input_ids)
|
332 |
|
333 |
+
# Calculate entropy
|
334 |
+
token_ids = generate_ids[0].tolist()
|
335 |
+
entropy = self._calculate_entropy(token_ids)
|
336 |
+
|
337 |
+
return caption.strip(), entropy
|
338 |
|
339 |
def generate_valid_caption(
|
340 |
self,
|
341 |
input_image: Image.Image,
|
342 |
+
prompt: str,
|
343 |
+
*,
|
344 |
+
limited_words: Dict[str, int] = {"fluffy": 2},
|
345 |
+
min_sentence_count: int = 3,
|
346 |
+
max_word_repetitions: int = 5,
|
347 |
+
min_entropy: float = 1.75,
|
348 |
+
stop_words: set[str] = STOP_WORDS,
|
349 |
) -> str:
|
350 |
"""
|
351 |
+
Generate a valid caption, retrying if certain conditions are not met.
|
352 |
+
|
353 |
+
Args:
|
354 |
+
input_image (Image.Image): The input image to caption.
|
355 |
+
prompt (str | None): Prompt for caption generation.
|
356 |
+
limited_words (Dict[str, int]): Dictionary of words with their maximum allowed occurrences. Default is {"fluffy": 1}.
|
357 |
+
min_sentence_count (int): Minimum required number of sentences. Default is 3.
|
358 |
+
max_word_repetitions (int): Maximum allowed repetitions for words longer than 4 characters. Default is 15.
|
359 |
+
min_entropy (float): Minimum required entropy of the caption. Default is 2.3.
|
360 |
+
|
361 |
+
Returns:
|
362 |
+
str: A valid caption meeting all specified criteria.
|
363 |
+
|
364 |
+
The method retries caption generation if:
|
365 |
+
- The caption contains only special characters
|
366 |
+
- The caption does not end with a period, exclamation mark, or question mark
|
367 |
+
- Any word in limited_words appears more than its specified maximum times
|
368 |
+
- Any word longer than 4 characters is repeated more than max_word_repetitions times
|
369 |
+
- The caption contains fewer than min_sentence_count sentences
|
370 |
+
- The entropy of the caption is below min_entropy
|
371 |
"""
|
372 |
while True:
|
373 |
+
caption, entropy = self.process_image(input_image, prompt)
|
374 |
+
words = re.findall(r"\b\w+\b", caption.lower())
|
375 |
+
word_counts = {
|
376 |
+
word: words.count(word) for word in set(words) if word not in stop_words
|
377 |
+
}
|
378 |
+
sentence_count = len(re.findall(r"[.!?]", caption))
|
379 |
+
|
380 |
+
if not re.search(r"\w", caption):
|
381 |
+
logging.info(
|
382 |
+
f"Retrying: Caption contains only special characters.\nCaption: {caption!r}"
|
383 |
+
)
|
384 |
+
elif caption[-1] not in {".", "!", "?"}:
|
385 |
+
logging.info(
|
386 |
+
f"Retrying: Caption does not end with proper punctuation.\nCaption: {caption!r}"
|
387 |
+
)
|
388 |
+
elif any(
|
389 |
+
caption.lower().count(word) > max_count
|
390 |
+
for word, max_count in limited_words.items()
|
391 |
+
):
|
392 |
+
exceeded_words = [
|
393 |
+
f"{word} ({caption.lower().count(word)}/{max_count})"
|
394 |
+
for word, max_count in limited_words.items()
|
395 |
+
if caption.lower().count(word) > max_count
|
396 |
+
]
|
397 |
+
logging.info(
|
398 |
+
f"Retrying: Limited words exceeded: {', '.join(exceeded_words)}.\nCaption: {caption!r}"
|
399 |
+
)
|
400 |
+
elif any(
|
401 |
+
count > max_word_repetitions
|
402 |
+
for word, count in word_counts.items()
|
403 |
+
if len(word) > 4
|
404 |
+
):
|
405 |
+
repeated_words = [
|
406 |
+
word
|
407 |
+
for word, count in word_counts.items()
|
408 |
+
if count > max_word_repetitions and len(word) > 4
|
409 |
+
]
|
410 |
+
logging.info(
|
411 |
+
f"Retrying: Words repeated more than {max_word_repetitions} times: {', '.join(repeated_words)}.\nCaption: {caption!r}"
|
412 |
+
)
|
413 |
+
elif sentence_count < min_sentence_count:
|
414 |
+
logging.info(
|
415 |
+
f"Retrying: Only {sentence_count} sentences (min: {min_sentence_count}).\nCaption: {caption!r}"
|
416 |
+
)
|
417 |
+
elif entropy < min_entropy:
|
418 |
+
logging.info(
|
419 |
+
f"Retrying: Low entropy ({entropy:.2f} < {min_entropy}).\nCaption: {caption!r}"
|
420 |
+
)
|
421 |
+
else:
|
422 |
return caption
|
|
|
423 |
|
424 |
+
@staticmethod
|
425 |
+
def get_prompt_string(caption_type, caption_length):
|
426 |
length = None if caption_length == "any" else caption_length
|
427 |
|
428 |
if isinstance(length, str):
|
|
|
431 |
except ValueError:
|
432 |
pass
|
433 |
|
434 |
+
# Build prompt
|
435 |
+
if length is None:
|
436 |
+
map_idx = 0
|
437 |
+
elif isinstance(length, int):
|
438 |
+
map_idx = 1
|
439 |
+
elif isinstance(length, str):
|
440 |
+
map_idx = 2
|
441 |
+
else:
|
442 |
+
raise ValueError(f"Invalid caption length: {length}")
|
443 |
|
444 |
+
caption_type = caption_type.lower()
|
445 |
+
if caption_type not in CAPTION_TYPE_MAP:
|
446 |
+
raise ValueError(f"Invalid caption type: {caption_type}")
|
|
|
|
|
|
|
|
|
|
|
447 |
|
448 |
+
prompt_str = CAPTION_TYPE_MAP[caption_type][map_idx]
|
449 |
+
prompt_str = prompt_str.format(length=caption_length, word_count=caption_length)
|
|
|
450 |
return prompt_str
|
451 |
|
452 |
+
@staticmethod
|
453 |
+
def _preprocess_image(input_image: Image.Image) -> torch.Tensor:
|
454 |
+
"""
|
455 |
+
Preprocess the input image for the CLIP model.
|
456 |
+
|
457 |
+
Args:
|
458 |
+
input_image (Image.Image): The input PIL image.
|
459 |
+
|
460 |
+
Returns:
|
461 |
+
torch.Tensor: Preprocessed image tensor.
|
462 |
+
"""
|
463 |
image = input_image.resize((384, 384), Image.LANCZOS)
|
464 |
pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
|
465 |
pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
|
466 |
+
return pixel_values.to("cuda")
|
|
|
467 |
|
468 |
+
def _embed_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
|
469 |
+
"""
|
470 |
+
Embed the preprocessed image using CLIP and the image adapter.
|
471 |
+
|
472 |
+
Args:
|
473 |
+
pixel_values (torch.Tensor): Preprocessed image tensor.
|
|
|
|
|
|
|
474 |
|
475 |
+
Returns:
|
476 |
+
torch.Tensor: Embedded image tensor.
|
477 |
+
"""
|
478 |
with torch.amp.autocast_mode.autocast("cuda", enabled=True):
|
479 |
vision_outputs = self.clip_model(
|
480 |
pixel_values=pixel_values, output_hidden_states=True
|
481 |
)
|
482 |
+
embedded_images = self.image_adapter(vision_outputs.hidden_states)
|
483 |
+
return embedded_images.to("cuda")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
484 |
|
485 |
+
def _construct_inputs(
|
486 |
+
self, embedded_images: torch.Tensor, prompt_str: str
|
487 |
+
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
488 |
+
"""
|
489 |
+
Construct the inputs for the text model's generate method.
|
490 |
+
|
491 |
+
Args:
|
492 |
+
embedded_images (torch.Tensor): Embedded image tensor.
|
493 |
+
prompt_str (str): The prompt string for captioning.
|
494 |
+
|
495 |
+
Returns:
|
496 |
+
tuple: (input_embeds, input_ids, attention_mask)
|
497 |
+
"""
|
498 |
+
# Build the conversation
|
499 |
+
convo = [
|
500 |
+
{"role": "system", "content": "You are a helpful image captioner."},
|
501 |
+
{"role": "user", "content": prompt_str},
|
502 |
+
]
|
503 |
+
|
504 |
+
# Format and tokenize the conversation
|
505 |
+
convo_string = self.tokenizer.apply_chat_template(
|
506 |
+
convo, tokenize=False, add_generation_prompt=True
|
507 |
)
|
508 |
+
logging.debug(f"Convo:\n{convo_string}")
|
509 |
+
convo_tokens = self.tokenizer.encode(
|
510 |
+
convo_string,
|
511 |
+
return_tensors="pt",
|
512 |
+
add_special_tokens=False,
|
513 |
+
truncation=False,
|
514 |
+
)
|
515 |
+
prompt_tokens = self.tokenizer.encode(
|
516 |
+
prompt_str, return_tensors="pt", add_special_tokens=False, truncation=False
|
517 |
+
)
|
518 |
+
convo_tokens = convo_tokens.squeeze(0)
|
519 |
+
prompt_tokens = prompt_tokens.squeeze(0)
|
520 |
+
|
521 |
+
# Calculate where to inject the image
|
522 |
+
eot_id_indices = (
|
523 |
+
(convo_tokens == self.tokenizer.convert_tokens_to_ids("<|eot_id|>"))
|
524 |
+
.nonzero(as_tuple=True)[0]
|
525 |
+
.tolist()
|
526 |
+
)
|
527 |
+
preamble_len = eot_id_indices[1] - prompt_tokens.shape[0]
|
528 |
|
529 |
+
# Embed the tokens
|
530 |
+
convo_embeds = self.text_model.model.embed_tokens(
|
531 |
+
convo_tokens.unsqueeze(0).to("cuda")
|
|
|
532 |
)
|
533 |
|
534 |
+
# Construct the input
|
535 |
+
input_embeds = torch.cat(
|
536 |
[
|
537 |
+
convo_embeds[:, :preamble_len],
|
538 |
+
embedded_images.to(dtype=convo_embeds.dtype),
|
539 |
+
convo_embeds[:, preamble_len:],
|
|
|
540 |
],
|
541 |
dim=1,
|
542 |
+
).to("cuda")
|
543 |
|
544 |
input_ids = torch.cat(
|
545 |
[
|
546 |
+
convo_tokens[:preamble_len].unsqueeze(0),
|
547 |
+
torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
|
548 |
+
convo_tokens[preamble_len:].unsqueeze(0),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
549 |
],
|
550 |
dim=1,
|
551 |
).to("cuda")
|
552 |
+
|
553 |
attention_mask = torch.ones_like(input_ids)
|
554 |
|
555 |
+
return input_embeds, input_ids, attention_mask
|
556 |
|
557 |
def _generate_caption(self, inputs_embeds, input_ids, attention_mask):
|
558 |
generate_ids = self.text_model.generate(
|
|
|
560 |
inputs_embeds=inputs_embeds,
|
561 |
attention_mask=attention_mask,
|
562 |
max_new_tokens=300,
|
563 |
+
# max_length=4096,
|
564 |
do_sample=True,
|
565 |
suppress_tokens=None,
|
566 |
repetition_penalty=1.2,
|
|
|
568 |
return generate_ids
|
569 |
|
570 |
def _decode_caption(self, generate_ids, input_ids):
|
571 |
+
generate_ids = generate_ids[:, input_ids.shape[1] :]
|
572 |
|
573 |
+
if generate_ids[0][-1] == self.tokenizer.eos_token_id or generate_ids[0][
|
574 |
+
-1
|
575 |
+
] == self.tokenizer.convert_tokens_to_ids("<|eot_id|>"):
|
576 |
generate_ids = generate_ids[:, :-1]
|
577 |
|
578 |
caption = self.tokenizer.batch_decode(
|
579 |
+
generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
|
|
|
|
|
580 |
)[0]
|
581 |
return caption
|
582 |
|
583 |
+
def _calculate_entropy(self, token_ids: List[int]) -> float:
|
584 |
+
"""
|
585 |
+
Calculate the entropy of a sequence of token IDs.
|
586 |
+
|
587 |
+
Args:
|
588 |
+
token_ids (List[int]): List of token IDs.
|
589 |
+
|
590 |
+
Returns:
|
591 |
+
float: Entropy of the token sequence.
|
592 |
+
"""
|
593 |
+
token_counts = {}
|
594 |
+
total_tokens = len(token_ids)
|
595 |
+
|
596 |
+
for token_id in token_ids:
|
597 |
+
token_counts[token_id] = token_counts.get(token_id, 0) + 1
|
598 |
+
|
599 |
+
entropy = 0
|
600 |
+
for count in token_counts.values():
|
601 |
+
probability = count / total_tokens
|
602 |
+
entropy -= probability * math.log2(probability)
|
603 |
+
|
604 |
+
return entropy
|
605 |
+
|
606 |
+
|
607 |
+
class ColoredFormatter(logging.Formatter):
    """Logging formatter that wraps each record in an ANSI color escape
    chosen by the record's level name."""

    COLORS = {
        "DEBUG": "\033[36m",  # Cyan
        "INFO": "\033[32m",  # Green
        "WARNING": "\033[33m",  # Yellow
        "ERROR": "\033[31m",  # Red
        "CRITICAL": "\033[31;1m",  # Bright Red
    }
    RESET = "\033[0m"

    def format(self, record):
        # Delegate the real formatting to the base class, then colorize.
        text = logging.Formatter.format(self, record)
        color = self.COLORS.get(record.levelname, "")
        return f"{color}{text}{self.RESET}"
|
620 |
+
|
621 |
+
|
622 |
+
def setup_logging(verbosity):
    """
    Configure the root logger with a colored stream handler.

    Args:
        verbosity (int): 0 selects INFO; 1 or more selects DEBUG.
    """
    # Bug fix: the original only assigned log_level for verbosity 0 and 1,
    # so "-vv" (verbosity >= 2) raised UnboundLocalError. Clamp everything
    # at or above 1 to DEBUG instead.
    log_level = logging.DEBUG if verbosity >= 1 else logging.INFO

    handler = logging.StreamHandler()
    formatter = ColoredFormatter(
        fmt="%(asctime)s | %(levelname)-8s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    handler.setFormatter(formatter)

    logger = logging.getLogger()
    logger.setLevel(log_level)
    logger.addHandler(handler)
|
637 |
+
|
638 |
|
639 |
def main():
|
640 |
"""
|
|
|
654 |
"--caption_type",
|
655 |
type=str,
|
656 |
default="descriptive",
|
657 |
+
choices=CAPTION_TYPE_MAP.keys(),
|
658 |
help="Type of caption to generate.",
|
659 |
)
|
660 |
parser.add_argument(
|
661 |
+
"--caption_length", type=str, default="any", help="Length of the caption."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
662 |
)
|
663 |
parser.add_argument(
|
664 |
"--dont-strip-commas",
|
665 |
action="store_true",
|
666 |
+
help=("If set, commas will not be stripped from the generated captions."),
|
|
|
|
|
667 |
)
|
668 |
parser.add_argument(
|
669 |
"--custom_prompt",
|
670 |
type=str,
|
671 |
+
help=("Custom prompt for the captioner. " "Use with --caption_type custom."),
|
|
|
|
|
|
|
672 |
)
|
673 |
parser.add_argument(
|
674 |
"--add-commas-to-sentence-ends",
|
|
|
686 |
"Optionally specify the number of tags to use."
|
687 |
),
|
688 |
)
|
689 |
+
parser.add_argument(
|
690 |
+
"--artist-from-folder",
|
691 |
+
action="store_true",
|
692 |
+
help="Get the artist name from the parent folder",
|
693 |
+
)
|
694 |
parser.add_argument(
|
695 |
"--random-tags",
|
696 |
type=int,
|
|
|
699 |
"Only works if --feed-from-tags is enabled."
|
700 |
),
|
701 |
)
|
702 |
+
parser.add_argument(
|
703 |
+
"--dry-run",
|
704 |
+
action="store_true",
|
705 |
+
help="Run in dry-run mode without loading models or generating captions.",
|
706 |
+
)
|
707 |
+
parser.add_argument(
|
708 |
+
"-v",
|
709 |
+
"--verbose",
|
710 |
+
action="count",
|
711 |
+
default=0,
|
712 |
+
help="Increase output verbosity (can be repeated)",
|
713 |
+
)
|
714 |
|
715 |
args = parser.parse_args()
|
716 |
|
717 |
+
setup_logging(args.verbose)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
718 |
|
719 |
+
tasks = []
|
720 |
image_extensions = {".webp", ".png", ".jpeg", ".jpg", ".jxl"}
|
721 |
for image_path in Path(args.directory).rglob("*"):
|
722 |
if image_path.suffix.lower() in image_extensions:
|
723 |
caption_file = image_path.with_suffix(".caption")
|
|
|
724 |
# Skip if the caption file already exists
|
725 |
if caption_file.exists():
|
726 |
+
logging.info(f"Skipping {image_path}: Caption file already exists.")
|
727 |
continue
|
728 |
+
tasks.append((image_path, caption_file))
|
729 |
+
|
730 |
+
if not tasks:
|
731 |
+
logging.error("No input file found.")
|
732 |
+
return
|
733 |
+
|
734 |
+
# Validate random-tags usage
|
735 |
+
if args.random_tags is not None and args.feed_from_tags is None:
|
736 |
+
parser.error("--random-tags can only be used when --feed-from-tags is enabled")
|
737 |
+
|
738 |
+
if args.feed_from_tags is not None and args.artist_from_folder:
|
739 |
+
raise ValueError("feed-from-tags and artist-from-folder can't be used together")
|
740 |
+
|
741 |
+
if args.feed_from_tags is not None:
|
742 |
+
logging.info("Loading e621 tag data")
|
743 |
+
tagset_normalizer = make_tagset_normalizer()
|
744 |
|
745 |
+
# Initialize and load models only if not in dry-run mode
|
746 |
+
if not args.dry_run:
|
747 |
+
joy_caption_model = JoyCaptionModel()
|
748 |
+
joy_caption_model.load_models()
|
749 |
+
else:
|
750 |
+
logging.info("Running in dry-run mode. Models will not be loaded.")
|
751 |
+
|
752 |
+
for image_path, caption_file in tasks:
|
753 |
+
if not args.dry_run:
|
754 |
input_image = Image.open(image_path).convert("RGB")
|
755 |
|
756 |
+
# Use custom prompt if specified
|
757 |
+
prompt = args.custom_prompt or JoyCaptionModel.get_prompt_string(
|
758 |
+
args.caption_type, args.caption_length
|
759 |
+
)
|
|
|
|
|
|
|
|
|
760 |
|
761 |
+
if args.feed_from_tags is not None:
|
762 |
+
prompt = prompt_from_tags(args, image_path, tagset_normalizer, prompt)
|
763 |
+
elif args.artist_from_folder:
|
764 |
+
prompt = prompt_from_folder(prompt, image_path.resolve())
|
765 |
|
766 |
+
if args.dry_run:
|
767 |
+
logging.info(
|
768 |
+
f"Dry run: Skipping caption generation for {image_path} with prompt:\n\t{prompt}"
|
|
|
|
|
|
|
769 |
)
|
770 |
+
continue
|
771 |
+
else:
|
772 |
+
logging.info(f"Prompt for {image_path}:\n\t{prompt}")
|
773 |
+
|
774 |
+
caption = joy_caption_model.generate_valid_caption(input_image, prompt)
|
775 |
+
|
776 |
+
# Replace multiple spaces with a single space
|
777 |
+
caption = " ".join(caption.split())
|
778 |
+
# Replace multiple newlines with a single newline
|
779 |
+
caption = "\n".join(
|
780 |
+
line for line in (line.strip() for line in caption.split("\n")) if line
|
781 |
+
)
|
782 |
|
783 |
+
# Strip commas if the --dont-strip-commas flag is not set
|
784 |
+
if not args.dont_strip_commas:
|
785 |
+
# Existing comma stripping logic
|
786 |
+
caption = re.sub(r",\s*([^\d])", r" \1", caption)
|
787 |
|
788 |
+
# New feature: Add commas after periods if specified
|
789 |
+
if args.add_commas_to_sentence_ends:
|
790 |
+
caption = re.sub(r"(\.)(\s+)([A-Z])", r"\1,\2\3", caption)
|
791 |
|
792 |
+
# Remove all newline characters
|
793 |
+
caption = caption.replace("\n", " ")
|
794 |
|
795 |
+
logging.info(f"Caption for {image_path}:\n\t{caption}\n\n")
|
796 |
|
797 |
+
# Save the caption to a .caption file
|
798 |
+
with open(caption_file, "w", encoding="utf-8") as f:
|
799 |
+
f.write(caption)
|
800 |
+
logging.info(f"Caption saved to {caption_file}")
|
801 |
|
802 |
|
803 |
RE_PARENS_SUFFIX = re.compile(r"_\([^)]+\)$")
|
|
|
874 |
TAG_ARTIST = tag_category2id["artist"]
|
875 |
TAG_COPYRIGHT = tag_category2id["copyright"]
|
876 |
TAG_META = tag_category2id["meta"]
|
|
|
877 |
|
878 |
|
879 |
+
def prompt_from_tags(
|
880 |
+
args,
|
881 |
+
image_path: Path,
|
882 |
+
tagset_normalizer: TagSetNormalizer,
|
883 |
+
base_prompt: str = "Write a descriptive caption for this image in a formal tone.",
|
884 |
+
tag_freq_threshold: int = 0,
|
885 |
+
tag_string_prefix: str = "Use these tags to construct your caption:",
|
886 |
+
):
|
887 |
"""
|
888 |
Generates a prompt from tags associated with the given image.
|
889 |
|
|
|
893 |
The path to the image file.
|
894 |
tagset_normalizer (TagSetNormalizer):
|
895 |
An instance to normalize the tag set.
|
|
|
|
|
|
|
896 |
"""
|
897 |
+
# Find and read the corresponding tag file
|
898 |
tag_file = find_tag_file(image_path)
|
899 |
if tag_file is None:
|
900 |
+
logging.warning(f"No tag file found for {image_path}")
|
901 |
+
return base_prompt
|
902 |
|
903 |
with open(tag_file, "r", encoding="utf-8") as f:
|
904 |
tags = f.read().lower().split(",")
|
905 |
|
906 |
+
# Get helper functions from the tagset_normalizer
|
907 |
tag_id_to_cat_id = tagset_normalizer.tag_normalizer.tag_categories
|
908 |
encode = tagset_normalizer.tag_normalizer.encode
|
909 |
|
910 |
+
# Initialize dictionaries and lists to store categorized tags
|
911 |
+
# These lists will contain tuples (freq, tag, tag_id)
|
912 |
tag_by_category: Dict[int, List[Tuple[int, str, int]]] = {
|
913 |
+
cat: [] for cat in [TAG_ARTIST, TAG_CHARACTER, TAG_COPYRIGHT, TAG_SPECIES]
|
|
|
914 |
}
|
915 |
other_tags: List[Tuple[int, str, int]] = []
|
916 |
implied: set = set()
|
917 |
+
|
918 |
+
# Process each tag
|
919 |
for tag in tags:
|
920 |
tag = tag.strip()
|
921 |
# Encode the tag into a numerical id
|
922 |
tag_id = encode(tag.replace(" ", "_"))
|
923 |
if tag_id is None:
|
924 |
+
# If tag is not recognized, add it to other_tags
|
925 |
other_tags.append((0, tag, 0))
|
926 |
implied.update(tagset_normalizer.implications_rej.get(0, ()))
|
927 |
continue
|
|
|
930 |
# Skip meta tags
|
931 |
if cat_id == TAG_META:
|
932 |
continue
|
933 |
+
# Update implied tags
|
934 |
implied.update(tagset_normalizer.implications.get(tag_id, ()))
|
935 |
# Get the frequency of the tag
|
936 |
freq = tag_rank_to_freq(tag_id)
|
937 |
+
if freq < tag_freq_threshold:
|
938 |
continue
|
939 |
+
# Add the tag to its category, or other_tags
|
940 |
+
tag_by_category.get(cat_id, other_tags).append((int(freq), tag, tag_id))
|
|
|
941 |
|
942 |
+
# Sort other_tags by frequency (descending) and filter out implied tags
|
943 |
other_tags = sorted(
|
944 |
+
(-freq, tag, tag_id)
|
945 |
for freq, tag, tag_id in other_tags
|
946 |
if tag_id not in implied
|
947 |
)
|
948 |
|
949 |
+
# Sort tags within each category, prefering non implied tags
|
950 |
for cat_id, cat_list in tag_by_category.items():
|
951 |
tag_by_category[cat_id] = sorted(
|
952 |
+
((tag_id in implied, -freq), tag, tag_id) for freq, tag, tag_id in cat_list
|
|
|
|
|
953 |
)
|
954 |
|
955 |
+
# Handle random tag selection or tag limit if specified
|
956 |
if args.random_tags is not None:
|
957 |
# Randomly select tags if --random-tags is specified
|
958 |
num_tags = min(args.random_tags, len(other_tags))
|
|
|
967 |
# Use specified number of tags if --feed-from-tags has a positive value
|
968 |
other_tags = other_tags[: args.feed_from_tags]
|
969 |
|
970 |
+
# Prepare sentence pieces for each category
|
971 |
artist_tag = tag_by_category[TAG_ARTIST]
|
972 |
if artist_tag:
|
973 |
+
artist_list = [str(tp[1]).removeprefix("by ") for tp in artist_tag[:4]]
|
|
|
974 |
artist_txt = f"by {format_nl_list(artist_list)}"
|
975 |
else:
|
976 |
artist_txt = ""
|
|
|
985 |
species_tag = tag_by_category[TAG_SPECIES]
|
986 |
if species_tag:
|
987 |
species_txt = (
|
988 |
+
"of a " if len(character_tag) <= 1 and len(species_tag) <= 1 else "of "
|
|
|
|
|
989 |
)
|
990 |
species_txt += format_nl_list([tp[1] for tp in species_tag[:4]])
|
991 |
else:
|
992 |
if character_tag:
|
993 |
species_txt = (
|
994 |
+
"of a character" if len(character_tag) <= 1 else "of characters"
|
995 |
)
|
996 |
else:
|
997 |
species_txt = ""
|
|
|
1002 |
copyright_txt = f"from {format_nl_list(tags)}"
|
1003 |
else:
|
1004 |
copyright_txt = ""
|
1005 |
+
|
1006 |
+
# Prepare the remaining tags as a string
|
1007 |
tag_string = ", ".join(tp[1] for tp in other_tags)
|
1008 |
+
|
1009 |
+
# Extract the prefix and suffix around the word "image" from the base prompt
|
1010 |
+
image_pos = base_prompt.find("image")
|
1011 |
+
if image_pos < 0:
|
1012 |
+
raise ValueError("Base prompt must contain the word 'image'")
|
1013 |
+
image_pos += len("image")
|
1014 |
+
base_prompt_prefix = base_prompt[:image_pos].rstrip()
|
1015 |
+
base_prompt_suffix = base_prompt[image_pos:].lstrip()
|
1016 |
+
|
1017 |
+
pieces = [
|
1018 |
+
base_prompt_prefix,
|
1019 |
+
artist_txt,
|
1020 |
+
species_txt,
|
1021 |
+
character_txt,
|
1022 |
+
copyright_txt,
|
1023 |
+
base_prompt_suffix,
|
1024 |
+
tag_string_prefix,
|
1025 |
tag_string,
|
1026 |
+
".",
|
1027 |
+
]
|
1028 |
+
logging.debug("Prompt pieces: %r", pieces)
|
1029 |
+
custom_prompt = " ".join(p for p in pieces if p)
|
1030 |
+
custom_prompt = custom_prompt.replace(" .", ".").replace(" ,", ",")
|
1031 |
return custom_prompt
|
1032 |
|
1033 |
|
|
|
1053 |
return None
|
1054 |
|
1055 |
|
1056 |
+
def prompt_from_folder(prompt, path):
    """Inject the parent-folder name into the prompt as an artist credit.

    Underscores, hyphens and dots in the folder name become spaces and the
    result is title-cased (e.g. 'john_doe' -> 'John Doe'); the first
    occurrence of the word 'image' in the prompt gains a 'by <artist>'
    suffix.
    """
    folder = path.parent.name
    for separator in ("_", "-", "."):
        folder = folder.replace(separator, " ")
    artist = folder.title()
    return prompt.replace("image", f"image by {artist}")
|
1061 |
+
|
1062 |
+
|
1063 |
if __name__ == "__main__":
|
1064 |
main()
|
keyframe
CHANGED
File without changes
|
ogg2wav
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/zsh

# ogg2wav — recursively convert every .ogg file under a directory to .wav
# using ffmpeg. Usage: ogg2wav [target_dir]  (defaults to the current dir).

# Convert a single ogg file to a wav file alongside it.
convert_ogg_to_wav() {
    local input_file="$1"
    # ${name:r} is the zsh "root" modifier: path with its extension removed.
    local output_file="${input_file:r}.wav"
    ffmpeg -i "$input_file" "$output_file"
    echo "Converted: $input_file -> $output_file"
}

# Set the target directory (default: current directory).
if [[ $# -eq 0 ]]; then
    target_dir="."
else
    target_dir="$1"
fi

# Check if the target directory exists.
if [[ ! -d "$target_dir" ]]; then
    echo "Error: Directory '$target_dir' does not exist." >&2
    exit 1
fi

# Bug fix: the original used ogg_files=($(find ...)), which word-splits and
# breaks on filenames containing spaces or newlines. A recursive zsh glob
# keeps each path intact; (N.) = nullglob + regular files only.
ogg_files=("$target_dir"/**/*.ogg(N.))

# Check if any .ogg files were found.
if [[ ${#ogg_files[@]} -eq 0 ]]; then
    echo "No .ogg files found in '$target_dir' or its subdirectories."
    exit 0
fi

# Convert each .ogg file to .wav.
for file in "${ogg_files[@]}"; do
    convert_ogg_to_wav "$file"
done

echo "Conversion complete."
|
paper-qa.code-workspace
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"folders": [
|
3 |
-
{
|
4 |
-
"path": "."
|
5 |
-
},
|
6 |
-
{
|
7 |
-
"path": "../miniconda3/lib/python3.12/site-packages/paperqa"
|
8 |
-
}
|
9 |
-
],
|
10 |
-
"settings": {}
|
11 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
remove_extra_whitespace
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
"""
|
5 |
+
This script removes all extra spaces (more than one) and new line characters (truncating to one single character)
|
6 |
+
from all *.caption and *.txt files in a target directory recursively. If no target directory is provided as an
|
7 |
+
argument, it processes the current directory.
|
8 |
+
|
9 |
+
Usage:
|
10 |
+
python script_name.py [target_directory]
|
11 |
+
|
12 |
+
Args:
|
13 |
+
target_directory (str, optional): The path to the target directory. If not provided, the current directory is used.
|
14 |
+
"""
|
15 |
+
|
16 |
+
import os
|
17 |
+
import sys
|
18 |
+
import glob
|
19 |
+
|
20 |
+
def remove_extra_spaces_and_newlines(file_path):
    """
    Collapse runs of whitespace in a text file, in place.

    Runs of spaces/tabs within a line become a single space, and runs of
    newlines become a single newline — as the module docstring promises.

    Bug fix: the original joined on ALL whitespace first
    (' '.join(content.split())), which silently turned every newline into
    a space, so the subsequent newline-collapsing step never did anything.

    Args:
        file_path (str): The path to the file to be processed.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Collapse horizontal whitespace within each line.
    collapsed_lines = (' '.join(line.split()) for line in content.split('\n'))

    # Drop now-empty lines so consecutive newlines shrink to a single one.
    content = '\n'.join(line for line in collapsed_lines if line)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)
|
39 |
+
|
40 |
+
def process_files_in_directory(directory):
    """
    Recursively clean every *.caption and *.txt file under a directory,
    removing extra spaces and newline characters from each file.

    Args:
        directory (str): The path to the directory to be processed.
    """
    for pattern in ('*.caption', '*.txt'):
        search = os.path.join(directory, '**', pattern)
        for file_path in glob.glob(search, recursive=True):
            remove_extra_spaces_and_newlines(file_path)
|
52 |
+
|
53 |
+
if __name__ == "__main__":
    # Use the first CLI argument as the target, else the current directory.
    target_directory = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()
    process_files_in_directory(target_directory)
|
60 |
+
|
remove_tag
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import pathlib
|
5 |
+
import re
|
6 |
+
|
7 |
+
def remove_word_from_file(file_path, word):
    """Strip every occurrence of `word` from a file, in place.

    A comma and a single whitespace character immediately following the
    word are removed along with it, keeping tag lists tidy.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        text = fh.read()

    # \b<word>,?\s? — the word at a boundary, an optional trailing comma,
    # then an optional single whitespace character.
    word_re = re.compile(r'\b' + re.escape(word) + r',?\s?')
    cleaned = word_re.sub('', text)

    with open(file_path, 'w', encoding='utf-8') as fh:
        fh.write(cleaned)
|
17 |
+
|
18 |
+
def remove_word_from_directory(directory, word):
    """Apply remove_word_from_file to every *.txt under `directory`,
    searching recursively."""
    root = pathlib.Path(directory)
    for txt_file in root.rglob('*.txt'):
        remove_word_from_file(txt_file, word)
|
22 |
+
|
23 |
+
if __name__ == "__main__":
    import sys

    # Expect exactly <directory> and <word> as arguments.
    if len(sys.argv) != 3:
        print("Usage: python script.py <directory> <word>")
        sys.exit(1)

    target_directory, target_word = sys.argv[1], sys.argv[2]
    remove_word_from_directory(target_directory, target_word)
|
33 |
+
|
stats
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
import os
|
4 |
+
import sys
|
5 |
+
from collections import Counter
|
6 |
+
|
7 |
+
# ANSI color codes
|
8 |
+
|
9 |
+
RED = "\033[91m"
|
10 |
+
GREEN = "\033[92m"
|
11 |
+
ORANGE = "\033[93m"
|
12 |
+
BLUE = "\033[94m"
|
13 |
+
MAGENTA = "\033[95m"
|
14 |
+
CYAN = "\033[96m"
|
15 |
+
RESET = "\033[0m"
|
16 |
+
|
17 |
+
EXT2COLOR = {
|
18 |
+
"jxl": CYAN,
|
19 |
+
"png": MAGENTA,
|
20 |
+
"jpg": RED,
|
21 |
+
"jpeg": RED,
|
22 |
+
"webp": MAGENTA,
|
23 |
+
"caption": BLUE,
|
24 |
+
"txt": BLUE,
|
25 |
+
}
|
26 |
+
EXT2ORDER = {ext: i for i, ext in enumerate(EXT2COLOR.keys())}
|
27 |
+
SORT_KEYS = ["name", "count", "image", "text", *EXT2COLOR.keys()]
|
28 |
+
|
29 |
+
TEXT_FORMATS = {"txt", "caption"}
|
30 |
+
IMAGE_FORMATS = EXT2COLOR.keys() - TEXT_FORMATS
|
31 |
+
|
32 |
+
|
33 |
+
def count_files(directory):
    """Tally files under `directory` (recursively) by extension.

    The leading dot is stripped from extensions; files whose extension is
    in IMAGE_FORMATS / TEXT_FORMATS also bump the aggregate 'image' /
    'text' counters.
    """
    file_counts = Counter()
    for _root, _dirs, names in os.walk(directory):
        for name in names:
            ext = os.path.splitext(name)[1]
            if len(ext) > 1:
                ext = ext[1:]  # drop the leading '.'
            file_counts[ext] += 1
            if ext in IMAGE_FORMATS:
                file_counts["image"] += 1
            elif ext in TEXT_FORMATS:
                file_counts["text"] += 1

    return file_counts
|
47 |
+
|
48 |
+
|
49 |
+
def main():
    """Print per-directory file-type statistics for the current directory.

    Optional argv[1] selects the sort key (one of SORT_KEYS); append "_r"
    to reverse the sort order.
    """
    sort_key_name = "name"
    sort_reverse = False
    if len(sys.argv) > 1:
        sort_key_name = sys.argv[1]
        if sort_key_name.endswith("_r"):
            sort_reverse = True
            sort_key_name = sort_key_name[:-2]

    if sort_key_name == "name":
        sort_key = lambda x: x[0]
    elif sort_key_name == "count":
        sort_key = lambda x: x[1]
    elif sort_key_name in SORT_KEYS:
        sort_key = lambda x: x[2].get(sort_key_name, 0)
    else:
        # Compatibility fix: nesting the same quote type inside an f-string
        # requires Python 3.12+, so build the list first.
        valid = ", ".join('"{}"'.format(k) for k in SORT_KEYS)
        print(f"Valid sort keys are {valid}")
        print('Prepending "_r" to reverse the sort order')
        sys.exit(1)

    current_directory = os.getcwd()
    directories = (
        d
        for d in os.listdir(current_directory)
        if os.path.isdir(os.path.join(current_directory, d))
    )

    stats = []
    grand_total = Counter()
    for directory in directories:
        dir_path = os.path.join(current_directory, directory)
        counts = count_files(dir_path)
        total_files = sum(v for k, v in counts.items() if k in EXT2ORDER)
        stats.append((directory, total_files, counts))
        grand_total.update(counts)

    stats.sort(key=sort_key, reverse=sort_reverse)
    # Sentinel row (directory=None) carrying the grand total.
    stats.append(
        (None, sum(v for k, v in grand_total.items() if k in EXT2ORDER), grand_total)
    )

    for directory, total_files, counts in stats:
        if total_files == 0:
            continue
        if directory is None:
            # Bug fix: the original fell through and also printed
            # "Directory: None" after the grand-total header.
            print("Grand Total:")
        else:
            print(f"Directory: {directory}")
        for ext, count in sorted(
            counts.items(), key=lambda x: EXT2ORDER.get(x[0], -1)
        ):
            if count == 0 or ext not in EXT2COLOR:
                continue
            print(f"{EXT2COLOR[ext]}{ext} files: {count}{RESET}")
        tally_color = GREEN if total_files >= 200 else ORANGE
        print(
            f"{tally_color}Total files: {total_files}{RESET} "
            f"({counts['image']} images, {counts['text']} texts)"
        )
        print()
|
105 |
+
|
106 |
+
|
107 |
+
if __name__ == "__main__":
|
108 |
+
main()
|
whisper
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from transformers import pipeline
|
6 |
+
import sys
|
7 |
+
import os
|
8 |
+
|
9 |
+
MODEL_NAME = "openai/whisper-large-v3-turbo"
|
10 |
+
BATCH_SIZE = 8
|
11 |
+
|
12 |
+
device = 0 if torch.cuda.is_available() else "cpu"
|
13 |
+
|
14 |
+
pipe = pipeline(
|
15 |
+
task="automatic-speech-recognition",
|
16 |
+
model=MODEL_NAME,
|
17 |
+
chunk_length_s=30,
|
18 |
+
device=device,
|
19 |
+
)
|
20 |
+
|
21 |
+
def transcribe(audio_file_path, task="transcribe"):
    """Run the Whisper ASR pipeline on an audio file.

    Args:
        audio_file_path: Path to the audio file on disk.
        task: "transcribe" or "translate".

    Returns:
        The recognized text, or None when the file is missing or the
        pipeline raises an exception.
    """
    if not os.path.exists(audio_file_path):
        print(f"Error: The file '{audio_file_path}' does not exist.")
        return

    try:
        result = pipe(
            audio_file_path,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        return result["text"]
    except Exception as e:
        print(f"Error during transcription: {str(e)}")
        return None
|
32 |
+
|
33 |
+
if __name__ == "__main__":
|
34 |
+
if len(sys.argv) < 2:
|
35 |
+
print("Usage: python script.py <audio_file_path> [task]")
|
36 |
+
print("task can be 'transcribe' or 'translate' (default is 'transcribe')")
|
37 |
+
sys.exit(1)
|
38 |
+
|
39 |
+
audio_file_path = sys.argv[1]
|
40 |
+
task = sys.argv[2] if len(sys.argv) > 2 else "transcribe"
|
41 |
+
|
42 |
+
if task not in ["transcribe", "translate"]:
|
43 |
+
print("Error: task must be either 'transcribe' or 'translate'")
|
44 |
+
sys.exit(1)
|
45 |
+
|
46 |
+
result = transcribe(audio_file_path, task)
|
47 |
+
if result:
|
48 |
+
print("Transcription result:")
|
49 |
+
print(result)
|
whisper2
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
|
6 |
+
import sys
|
7 |
+
import os
|
8 |
+
import warnings
|
9 |
+
|
10 |
+
# Suppress specific warnings
|
11 |
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
12 |
+
warnings.filterwarnings("ignore", category=UserWarning)
|
13 |
+
|
14 |
+
MODEL_NAME = "openai/whisper-large-v3"
|
15 |
+
BATCH_SIZE = 8
|
16 |
+
|
17 |
+
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
18 |
+
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
19 |
+
|
20 |
+
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
21 |
+
MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
|
22 |
+
)
|
23 |
+
model.to(device)
|
24 |
+
|
25 |
+
processor = AutoProcessor.from_pretrained(MODEL_NAME)
|
26 |
+
|
27 |
+
pipe = pipeline(
|
28 |
+
"automatic-speech-recognition",
|
29 |
+
model=model,
|
30 |
+
tokenizer=processor.tokenizer,
|
31 |
+
feature_extractor=processor.feature_extractor,
|
32 |
+
# max_new_tokens=448,
|
33 |
+
chunk_length_s=30,
|
34 |
+
batch_size=BATCH_SIZE,
|
35 |
+
return_timestamps=True,
|
36 |
+
torch_dtype=torch_dtype,
|
37 |
+
device=device,
|
38 |
+
)
|
39 |
+
|
40 |
+
def transcribe(audio_file_path, task="transcribe"):
    """Transcribe (or translate) an audio file with the preloaded pipeline.

    Prints the full pipeline result for inspection and returns the
    recognized text, or None on a missing file / pipeline error.
    """
    if not os.path.exists(audio_file_path):
        print(f"Error: The file '{audio_file_path}' does not exist.")
        return

    try:
        # Inference only — disable autograd bookkeeping.
        with torch.no_grad():
            result = pipe(audio_file_path, generate_kwargs={"task": task})
        from pprint import pprint
        pprint(result)
        return result["text"]
    except Exception as e:
        print(f"Error during transcription: {str(e)}")
        return None
|
54 |
+
|
55 |
+
if __name__ == "__main__":
|
56 |
+
if len(sys.argv) < 2:
|
57 |
+
print("Usage: python script.py <audio_file_path> [task]")
|
58 |
+
print("task can be 'transcribe' or 'translate' (default is 'transcribe')")
|
59 |
+
sys.exit(1)
|
60 |
+
|
61 |
+
audio_file_path = sys.argv[1]
|
62 |
+
task = sys.argv[2] if len(sys.argv) > 2 else "transcribe"
|
63 |
+
|
64 |
+
if task not in ["transcribe", "translate"]:
|
65 |
+
print("Error: task must be either 'transcribe' or 'translate'")
|
66 |
+
sys.exit(1)
|
67 |
+
|
68 |
+
result = transcribe(audio_file_path, task)
|
69 |
+
if result:
|
70 |
+
print("Transcription result:")
|
71 |
+
print(result)
|
72 |
+
|
zsh/png2mp4.zsh
ADDED
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# png2mp4_x265()
# Converts a series of PNG images to an MP4 video using x265 encoding
#
# Usage: png2mp4_x265 [--step <number>] [--repeat <number>]
#
# Options:
#   --step <number>   : Multiplier for step count in overlay text (default: 1)
#   --repeat <number> : Number of frames each source image is held (default: 1)
#
# This function:
#   1. Deactivates the conda environment
#   2. Counts the PNG files in the current directory
#   3. Names the output after the current directory ("<dir>_x265.mp4")
#   4. Runs a single ffmpeg pass with x265 encoding, including:
#      - output frame rate of 60 fps (each image shown `repeat` frames)
#      - image scaling to 1024x1024
#      - a step-count overlay (frame index * step multiplier)
#      - high-quality, animation-tuned x265 settings
#   5. Appends 8 seconds of cloned padding and a 5-second fade-out
#   6. Reactivates the conda environment
#
# Requirements:
#   - ffmpeg with libx265 support
#   - Inconsolata-Light.ttf font in /usr/share/fonts/TTF/
png2mp4_x265() {
    conda deactivate
    local step_multiplier=1
    local repeat=1
    local frame_rate=60
    local resolution="1024x1024"

    while [[ "$#" -gt 0 ]]; do
        case $1 in
            --step) step_multiplier="$2"; shift ;;
            --repeat) repeat="$2"; shift ;;
            *) echo "Unknown parameter passed: $1"; return 1 ;;
        esac
        shift
    done

    # Name the output after the working directory so runs are self-describing.
    # `local` keeps the variable from leaking into the interactive shell.
    local output_filename="$(basename "$(pwd)")_x265.mp4"
    echo "Output filename: $output_filename"

    local nframes=$(find . -type f -name '*.png' | wc -l)
    # Trailing '.' on the divisor forces zsh floating-point arithmetic.
    local duration=$(($nframes * $repeat / ${frame_rate}.))
    local fade_start=$((duration + 3))
    echo "Found $nframes for a duration of $duration seconds"

    echo "Running ffmpeg with x265 encoding..."
    local font=/usr/share/fonts/TTF/Inconsolata-Light.ttf
    local drawtext="drawtext=fontfile=${font}:text='Steps\: %{eif\\:trunc(n*$step_multiplier)\\:u\\:3}':x=10:y=h-th-10:fontsize=24:fontcolor=white"
    # Clone the last frame for 8s, then fade out over 5s starting 3s in.
    local fadeout="tpad=stop_mode=clone:stop_duration=8,fade=t=out:st=${fade_start}:d=5"
    local encoder=(
        -pix_fmt yuv420p
        -c:v libx265
        -preset slower
        -tune animation
        -crf 22
        # Keyframe interval tied to `repeat` puts one keyframe per source image.
        -x265-params "keyint=${repeat}:min-keyint=$((repeat-1)):scenecut=0:ref=5:bframes=8:b-adapt=2:rc-lookahead=$((2*repeat)):lookahead-slices=4:aq-mode=3:aq-strength=0.8:deblock=-1,-1:sao=0"
    )

    # BUG FIX: write to the announced "$output_filename" instead of the
    # hard-coded "sample_x265.mp4" the function previously produced.
    ffmpeg -framerate "$frame_rate/$repeat" -pattern_type glob -i "*.png" \
        -vf "scale=${resolution},${drawtext},fps=${frame_rate},${fadeout}" \
        "${encoder[@]}" -y "$output_filename"
    if [ $? -ne 0 ]; then
        echo "Error: ffmpeg command failed."
        return 1
    fi

    conda activate
    echo "Process completed successfully."
}
|
73 |
+
|
74 |
+
# png2mp4()
|
75 |
+
# Converts a series of PNG images to an MP4 video using x264 encoding
|
76 |
+
#
|
77 |
+
# Usage: png2mp4 [--max <number>] [--step <number>] [--repeat <number>]
|
78 |
+
#
|
79 |
+
# Options:
|
80 |
+
# --max <number> : Maximum number of images to process
|
81 |
+
# --step <number> : Multiplier for step count in overlay text (default: 1)
|
82 |
+
# --repeat <number> : Number of times to repeat each image (default: 1)
|
83 |
+
#
|
84 |
+
# This function:
|
85 |
+
# 1. Deactivates conda environment
|
86 |
+
# 2. Creates a temporary directory for processing
|
87 |
+
# 3. Finds all PNG files in the current directory
|
88 |
+
# 4. Uses the current directory name as the output filename prefix
|
89 |
+
# 5. Copies and optionally repeats images to the temp directory
|
90 |
+
# 6. Uses ffmpeg to create an MP4 with x264 encoding, including:
|
91 |
+
# - Frame rate of 60 fps
|
92 |
+
# - Image scaling to 1024x1024
|
93 |
+
# - Step count overlay text (divided by 1,000,000 and truncated to remove 6 zeros and decimal places)
|
94 |
+
# - CRF value of 28 for compression
|
95 |
+
# 7. Adds padding and fade-out effect to the final video
|
96 |
+
# 8. Cleans up temporary files
|
97 |
+
# 9. Reactivates conda environment
|
98 |
+
#
|
99 |
+
# Requirements:
|
100 |
+
# - ffmpeg with libx264 support
|
101 |
+
# - bc (basic calculator)
|
102 |
+
# - Inconsolata-Light.ttf font in /usr/share/fonts/TTF/
|
103 |
+
png2mp4() {
|
104 |
+
conda deactivate
|
105 |
+
local max_images=""
|
106 |
+
local step_multiplier=1
|
107 |
+
local repeat=1
|
108 |
+
local temp_dir="/home/kade/.local/tmp"
|
109 |
+
|
110 |
+
while [[ "$#" -gt 0 ]]; do
|
111 |
+
case $1 in
|
112 |
+
--max) max_images="$2"; shift ;;
|
113 |
+
--step) step_multiplier="$2"; shift ;;
|
114 |
+
--repeat) repeat="$2"; shift ;;
|
115 |
+
*) echo "Unknown parameter passed: $1"; return 1 ;;
|
116 |
+
esac
|
117 |
+
shift
|
118 |
+
done
|
119 |
+
|
120 |
+
echo "Creating temporary directory..."
|
121 |
+
mkdir -p "$temp_dir"
|
122 |
+
|
123 |
+
echo "Checking for PNG files..."
|
124 |
+
png_files=($(/usr/bin/env ls *.png 2>/dev/null))
|
125 |
+
if [ ${#png_files[@]} -eq 0 ]; then
|
126 |
+
echo "Error: No PNG files found in the current directory."
|
127 |
+
return 1
|
128 |
+
fi
|
129 |
+
|
130 |
+
echo "Setting output filename..."
|
131 |
+
output_filename="$(basename "$(pwd)").mp4"
|
132 |
+
echo "Output filename: $output_filename"
|
133 |
+
|
134 |
+
echo "Creating repeated images..."
|
135 |
+
for img in "${png_files[@]}"; do
|
136 |
+
for i in $(seq 1 $repeat); do
|
137 |
+
cp "$img" "$temp_dir/${img%.*}_${i}.png"
|
138 |
+
done
|
139 |
+
done
|
140 |
+
|
141 |
+
echo "Running ffmpeg..."
|
142 |
+
if [[ -n "$max_images" ]]; then
|
143 |
+
ffmpeg -framerate 60 -pattern_type glob -i "$temp_dir/*.png" -vf "scale=1024x1024,select='not(mod(n\,$max_images))',drawtext=fontfile=/usr/share/fonts/TTF/Inconsolata-Light.ttf:text='Steps\: %{expr\:trunc(n*$step_multiplier/$repeat/1000000)}':x=10:y=h-th-10:fontsize=24:fontcolor=white" -crf 28 \
|
144 |
+
-c:v libx264 -pix_fmt yuv420p -y "$temp_dir/temp.mp4"
|
145 |
+
else
|
146 |
+
ffmpeg -framerate 60 -pattern_type glob -i "$temp_dir/*.png" -vf "scale=1024x1024,drawtext=fontfile=/usr/share/fonts/TTF/Inconsolata-Light.ttf:text='Steps\: %{expr\:trunc(n*$step_multiplier/$repeat/1000000)}':x=10:y=h-th-10:fontsize=24:fontcolor=white" -crf 28 \
|
147 |
+
-c:v libx264 -pix_fmt yuv420p -y "$temp_dir/temp.mp4"
|
148 |
+
fi
|
149 |
+
|
150 |
+
if [ $? -ne 0 ]; then
|
151 |
+
echo "Error: ffmpeg command failed."
|
152 |
+
rm -rf "$temp_dir"
|
153 |
+
return 1
|
154 |
+
fi
|
155 |
+
|
156 |
+
echo "Processing final video..."
|
157 |
+
duration=$(ffmpeg -i "$temp_dir/temp.mp4" 2>&1 | grep 'Duration' | awk '{print $2}' | tr -d , | awk -F: '{print ($1 * 3600) + ($2 * 60) + $3}')
|
158 |
+
fade_start=$(echo "$duration + 3" | bc)
|
159 |
+
ffmpeg -i "$temp_dir/temp.mp4" -vf "tpad=stop_mode=clone:stop_duration=8,fade=t=out:st=$fade_start:d=5" -c:v libx264 -pix_fmt yuv420p -y "$output_filename"
|
160 |
+
|
161 |
+
if [ $? -ne 0 ]; then
|
162 |
+
echo "Error: Final ffmpeg processing failed."
|
163 |
+
rm -rf "$temp_dir"
|
164 |
+
return 1
|
165 |
+
fi
|
166 |
+
|
167 |
+
echo "Cleaning up temporary files..."
|
168 |
+
rm -rf "$temp_dir"
|
169 |
+
|
170 |
+
conda activate
|
171 |
+
echo "Process completed successfully."
|
172 |
+
}
|