we just need to fix something
Browse files
Signed-off-by: Balazs Horvath <acsipont@gmail.com>
- .zshrc +41 -9
- cgrkzexw-599808/clip_model.pt +3 -0
- cgrkzexw-599808/config.yaml +39 -0
- cgrkzexw-599808/image_adapter.pt +3 -0
- joy +6 -2
.zshrc
CHANGED
@@ -912,33 +912,65 @@ import safetensors, json
|
|
912 |
filePath = '$filePath'
|
913 |
print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
|
914 |
}
|
|
|
915 |
# Function: png2mp4
|
916 |
# Description:
|
917 |
# This function converts a series of PNG images into an MP4 video file using ffmpeg.
|
918 |
#
|
919 |
# Usage:
|
920 |
-
# png2mp4
|
921 |
#
|
922 |
# Parameters:
|
923 |
-
#
|
924 |
#
|
925 |
# Actions:
|
926 |
# 1. Sets the frame rate to 8 fps
|
927 |
# 2. Uses glob pattern to include all PNG files in the current directory
|
928 |
-
# 3.
|
929 |
-
# 4.
|
930 |
-
# 5.
|
931 |
-
# 6.
|
932 |
-
# 7.
|
|
|
933 |
#
|
934 |
# Notes:
|
935 |
# - Requires ffmpeg to be installed and accessible in the system path
|
936 |
# - PNG files should be in the current directory
|
937 |
# - Output video will be named 'out.mp4' and placed in the current directory
|
938 |
# - Adjust the framerate, scale, or CRF value as needed for different results
|
|
|
939 |
png2mp4() {
|
940 |
-
|
941 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
942 |
}
|
943 |
|
944 |
# Function: c
|
|
|
912 |
filePath = '$filePath'
|
913 |
print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
|
914 |
}
|
915 |
+
|
916 |
# Function: png2mp4
|
917 |
# Description:
|
918 |
# This function converts a series of PNG images into an MP4 video file using ffmpeg.
|
919 |
#
|
920 |
# Usage:
|
921 |
+
# png2mp4 [--max <number>] [--step <multiplier>]
|
922 |
#
|
923 |
# Parameters:
|
924 |
+
# --max <number> (optional): Subsamples the input, keeping only every <number>-th frame (frames where frame index mod <number> == 0).
# --step <multiplier> (optional): Multiplier applied to the frame index for the "Steps" overlay text.
|
925 |
#
|
926 |
# Actions:
|
927 |
# 1. Sets the frame rate to 4 fps
|
928 |
# 2. Uses glob pattern to include all PNG files in the current directory
|
929 |
+
# 3. Optionally subsamples the input frames based on the --max argument
|
930 |
+
# 4. Scales the output video to 1024x1024 resolution
|
931 |
+
# 5. Sets the Constant Rate Factor (CRF) to 28 for good compression
|
932 |
+
# 6. Uses the libx264 codec for H.264 encoding
|
933 |
+
# 7. Sets the pixel format to yuv420p for compatibility
|
934 |
+
# 8. Outputs the result as 'out.mp4' in the current directory
|
935 |
#
|
936 |
# Notes:
|
937 |
# - Requires ffmpeg to be installed and accessible in the system path
|
938 |
# - PNG files should be in the current directory
|
939 |
# - Output video will be named 'out.mp4' and placed in the current directory
|
940 |
# - Adjust the framerate, scale, or CRF value as needed for different results
|
941 |
+
# - Use the --max argument to limit the number of input images
|
942 |
png2mp4() {
|
943 |
+
conda deactivate
|
944 |
+
local max_images=""
|
945 |
+
local step_multiplier=1
|
946 |
+
local prefix=""
|
947 |
+
while [[ "$#" -gt 0 ]]; do
|
948 |
+
case $1 in
|
949 |
+
--max) max_images="$2"; shift ;;
|
950 |
+
--step) step_multiplier="$2"; shift ;;
|
951 |
+
*) echo "Unknown parameter passed: $1"; return 1 ;;
|
952 |
+
esac
|
953 |
+
shift
|
954 |
+
done
|
955 |
+
|
956 |
+
# Extract the prefix from the first image filename
|
957 |
+
prefix=$(/usr/bin/env ls *.png | head -n 1 | cut -d'-' -f1 | tr -d '[:cntrl:]' | tr -d '[]')
|
958 |
+
|
959 |
+
if [[ -n "$max_images" ]]; then
|
960 |
+
ffmpeg -framerate 4 -pattern_type glob -i '*.png' -vf "scale=1024x1024,select='not(mod(n\,$max_images))',drawtext=fontfile=/usr/share/fonts/TTF/Inconsolata-Light.ttf:text='Steps\: %{eif\:n*$step_multiplier}':x=10:y=h-th-10:fontsize=24:fontcolor=white" -crf 28 \
|
961 |
+
-c:v libx264 -pix_fmt yuv420p -y temp.mp4
|
962 |
+
else
|
963 |
+
ffmpeg -framerate 4 -pattern_type glob -i '*.png' -vf "scale=1024x1024,drawtext=fontfile=/usr/share/fonts/TTF/Inconsolata-Light.ttf:text='Steps\: %{eif\:n*$step_multiplier}':x=10:y=h-th-10:fontsize=24:fontcolor=white" -crf 28 \
|
964 |
+
-c:v libx264 -pix_fmt yuv420p -y temp.mp4
|
965 |
+
fi
|
966 |
+
|
967 |
+
# Add 3 seconds of the last frame, then 5 seconds delay, and then fade out smoothly for 5 seconds
|
968 |
+
ffmpeg -i temp.mp4 -vf "tpad=stop_mode=clone:stop_duration=8,fade=t=out:st=$(ffmpeg -i temp.mp4 2>&1 | grep 'Duration' | awk '{print $2}' | tr -d , | awk -F: '{print ($1 * 3600) + ($2 * 60) + $3 + 5}'):d=5" -c:v libx264 -pix_fmt yuv420p -y "${prefix}.mp4"
|
969 |
+
|
970 |
+
# Clean up temporary file
|
971 |
+
rm temp.mp4
|
972 |
+
|
973 |
+
conda activate
|
974 |
}
|
975 |
|
976 |
# Function: c
|
cgrkzexw-599808/clip_model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9277e041aab3e7f20a8e6ecf7248b663aac1c281daf4472c12a6e5013cf9f0cc
|
3 |
+
size 1713067838
|
cgrkzexw-599808/config.yaml
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_project: joy-caption-1
|
2 |
+
device_batch_size: 2
|
3 |
+
batch_size: 256
|
4 |
+
learning_rate: 0.0002
|
5 |
+
warmup_samples: 18000
|
6 |
+
max_samples: 600000
|
7 |
+
save_every: 50000
|
8 |
+
test_every: 50000
|
9 |
+
use_amp: true
|
10 |
+
grad_scaler: true
|
11 |
+
lr_scheduler_type: cosine
|
12 |
+
min_lr_ratio: 0.0
|
13 |
+
allow_tf32: true
|
14 |
+
seed: 69
|
15 |
+
num_workers: 8
|
16 |
+
optimizer_type: adamw
|
17 |
+
adam_beta1: 0.9
|
18 |
+
adam_beta2: 0.999
|
19 |
+
adam_eps: 1.0e-08
|
20 |
+
adam_weight_decay: 0.0
|
21 |
+
clip_grad_norm: 1.0
|
22 |
+
dataset: fancyfeast/joy-captioning-20240924a
|
23 |
+
clip_model: google/siglip-so400m-patch14-384
|
24 |
+
text_model: ../lora-train/lora_model_vwbzycxh
|
25 |
+
resume: null
|
26 |
+
gradient_checkpointing: false
|
27 |
+
test_size: 2048
|
28 |
+
grad_scaler_init: 65536.0
|
29 |
+
max_caption_length: 257
|
30 |
+
num_image_tokens: 32
|
31 |
+
adapter_type: mlp
|
32 |
+
text_model_dtype: bfloat16
|
33 |
+
pre_test: false
|
34 |
+
train_image_model: true
|
35 |
+
image_model_lr: null
|
36 |
+
train_lora: true
|
37 |
+
lora_r: 64
|
38 |
+
lora_alpha: 16
|
39 |
+
lora_dropout: 0.1
|
cgrkzexw-599808/image_adapter.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:38db2fe263be2d494a50be4a7bbfd7b23b76f9d03e4008a1b7df97d6b27894ef
|
3 |
+
size 86067714
|
joy
CHANGED
@@ -270,7 +270,9 @@ class JoyCaptionModel:
|
|
270 |
self.clip_model.to("cuda")
|
271 |
|
272 |
logging.info("Loading tokenizer")
|
273 |
-
self.tokenizer = AutoTokenizer.from_pretrained(
|
|
|
|
|
274 |
assert isinstance(
|
275 |
self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
|
276 |
)
|
@@ -558,7 +560,8 @@ class JoyCaptionModel:
|
|
558 |
input_ids,
|
559 |
inputs_embeds=inputs_embeds,
|
560 |
attention_mask=attention_mask,
|
561 |
-
max_new_tokens=
|
|
|
562 |
do_sample=True,
|
563 |
suppress_tokens=None,
|
564 |
repetition_penalty=1.2,
|
@@ -996,6 +999,7 @@ def prompt_from_tags(
|
|
996 |
base_prompt_suffix,
|
997 |
tag_string_prefix,
|
998 |
tag_string,
|
|
|
999 |
]
|
1000 |
logging.debug("Prompt pieces: %r", pieces)
|
1001 |
custom_prompt = " ".join(p for p in pieces if p)
|
|
|
270 |
self.clip_model.to("cuda")
|
271 |
|
272 |
logging.info("Loading tokenizer")
|
273 |
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
274 |
+
CHECKPOINT_PATH / "text_model", use_fast=True
|
275 |
+
)
|
276 |
assert isinstance(
|
277 |
self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
|
278 |
)
|
|
|
560 |
input_ids,
|
561 |
inputs_embeds=inputs_embeds,
|
562 |
attention_mask=attention_mask,
|
563 |
+
max_new_tokens=300,
|
564 |
+
# max_length=4096,
|
565 |
do_sample=True,
|
566 |
suppress_tokens=None,
|
567 |
repetition_penalty=1.2,
|
|
|
999 |
base_prompt_suffix,
|
1000 |
tag_string_prefix,
|
1001 |
tag_string,
|
1002 |
+
".",
|
1003 |
]
|
1004 |
logging.debug("Prompt pieces: %r", pieces)
|
1005 |
custom_prompt = " ".join(p for p in pieces if p)
|