k4d3 commited on
Commit
3f4436c
1 Parent(s): 913d039

we just need to fix something

Browse files

Signed-off-by: Balazs Horvath <acsipont@gmail.com>

.zshrc CHANGED
@@ -912,33 +912,65 @@ import safetensors, json
912
  filePath = '$filePath'
913
  print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
914
  }
 
915
  # Function: png2mp4
916
  # Description:
917
  # This function converts a series of PNG images into an MP4 video file using ffmpeg.
918
  #
919
  # Usage:
920
- # png2mp4
921
  #
922
  # Parameters:
923
- # None (uses all PNG files in the current directory)
924
  #
925
  # Actions:
926
  # 1. Sets the frame rate to 8 fps
927
  # 2. Uses glob pattern to include all PNG files in the current directory
928
- # 3. Scales the output video to 1024x1024 resolution
929
- # 4. Sets the Constant Rate Factor (CRF) to 28 for good compression
930
- # 5. Uses the libx264 codec for H.264 encoding
931
- # 6. Sets the pixel format to yuv420p for compatibility
932
- # 7. Outputs the result as 'out.mp4' in the current directory
 
933
  #
934
  # Notes:
935
  # - Requires ffmpeg to be installed and accessible in the system path
936
  # - PNG files should be in the current directory
937
  # - Output video will be named 'out.mp4' and placed in the current directory
938
  # - Adjust the framerate, scale, or CRF value as needed for different results
 
939
  png2mp4() {
940
- ffmpeg -framerate 8 -pattern_type glob -i '*.png' -vf scale=1024x1024 -crf 28 \
941
- -c:v libx264 -pix_fmt yuv420p out.mp4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
942
  }
943
 
944
  # Function: c
 
912
  filePath = '$filePath'
913
  print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
914
  }
915
+
916
  # Function: png2mp4
917
  # Description:
918
  # This function converts a series of PNG images into an MP4 video file using ffmpeg.
919
  #
920
  # Usage:
921
+ # png2mp4 [--max <n>] [--step <multiplier>]
922
  #
923
  # Parameters:
924
+ # --max <n> (optional): keeps only every n-th input image (ffmpeg select filter).
+ # --step <multiplier> (optional): multiplier for the "Steps" counter overlaid on each frame.
925
  #
926
  # Actions:
927
  # 1. Sets the frame rate to 8 fps
928
  # 2. Uses glob pattern to include all PNG files in the current directory
929
+ # 3. Optionally samples every n-th input image when --max is given
930
+ # 4. Scales the output video to 1024x1024 resolution
931
+ # 5. Sets the Constant Rate Factor (CRF) to 28 for good compression
932
+ # 6. Uses the libx264 codec for H.264 encoding
933
+ # 7. Sets the pixel format to yuv420p for compatibility
934
+ # 8. Outputs the result as 'out.mp4' in the current directory
935
  #
936
  # Notes:
937
  # - Requires ffmpeg to be installed and accessible in the system path
938
  # - PNG files should be in the current directory
939
  # - Output video will be named 'out.mp4' and placed in the current directory
940
  # - Adjust the framerate, scale, or CRF value as needed for different results
941
+ # - Use the --max argument to subsample the input (keep every n-th image)
942
  png2mp4() {
943
+ conda deactivate
944
+ local max_images=""
945
+ local step_multiplier=1
946
+ local prefix=""
947
+ while [[ "$#" -gt 0 ]]; do
948
+ case $1 in
949
+ --max) max_images="$2"; shift ;;
950
+ --step) step_multiplier="$2"; shift ;;
951
+ *) echo "Unknown parameter passed: $1"; return 1 ;;
952
+ esac
953
+ shift
954
+ done
955
+
956
+ # Extract the prefix from the first image filename
957
+ prefix=$(/usr/bin/env ls *.png | head -n 1 | cut -d'-' -f1 | tr -d '[:cntrl:]' | tr -d '[]')
958
+
959
+ if [[ -n "$max_images" ]]; then
960
+ ffmpeg -framerate 4 -pattern_type glob -i '*.png' -vf "scale=1024x1024,select='not(mod(n\,$max_images))',drawtext=fontfile=/usr/share/fonts/TTF/Inconsolata-Light.ttf:text='Steps\: %{eif\:n*$step_multiplier}':x=10:y=h-th-10:fontsize=24:fontcolor=white" -crf 28 \
961
+ -c:v libx264 -pix_fmt yuv420p -y temp.mp4
962
+ else
963
+ ffmpeg -framerate 4 -pattern_type glob -i '*.png' -vf "scale=1024x1024,drawtext=fontfile=/usr/share/fonts/TTF/Inconsolata-Light.ttf:text='Steps\: %{eif\:n*$step_multiplier}':x=10:y=h-th-10:fontsize=24:fontcolor=white" -crf 28 \
964
+ -c:v libx264 -pix_fmt yuv420p -y temp.mp4
965
+ fi
966
+
967
+ # Add 3 seconds of the last frame, then 5 seconds delay, and then fade out smoothly for 5 seconds
968
+ ffmpeg -i temp.mp4 -vf "tpad=stop_mode=clone:stop_duration=8,fade=t=out:st=$(ffmpeg -i temp.mp4 2>&1 | grep 'Duration' | awk '{print $2}' | tr -d , | awk -F: '{print ($1 * 3600) + ($2 * 60) + $3 + 5}'):d=5" -c:v libx264 -pix_fmt yuv420p -y "${prefix}.mp4"
969
+
970
+ # Clean up temporary file
971
+ rm temp.mp4
972
+
973
+ conda activate
974
  }
975
 
976
  # Function: c
cgrkzexw-599808/clip_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9277e041aab3e7f20a8e6ecf7248b663aac1c281daf4472c12a6e5013cf9f0cc
3
+ size 1713067838
cgrkzexw-599808/config.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_project: joy-caption-1
2
+ device_batch_size: 2
3
+ batch_size: 256
4
+ learning_rate: 0.0002
5
+ warmup_samples: 18000
6
+ max_samples: 600000
7
+ save_every: 50000
8
+ test_every: 50000
9
+ use_amp: true
10
+ grad_scaler: true
11
+ lr_scheduler_type: cosine
12
+ min_lr_ratio: 0.0
13
+ allow_tf32: true
14
+ seed: 69
15
+ num_workers: 8
16
+ optimizer_type: adamw
17
+ adam_beta1: 0.9
18
+ adam_beta2: 0.999
19
+ adam_eps: 1.0e-08
20
+ adam_weight_decay: 0.0
21
+ clip_grad_norm: 1.0
22
+ dataset: fancyfeast/joy-captioning-20240924a
23
+ clip_model: google/siglip-so400m-patch14-384
24
+ text_model: ../lora-train/lora_model_vwbzycxh
25
+ resume: null
26
+ gradient_checkpointing: false
27
+ test_size: 2048
28
+ grad_scaler_init: 65536.0
29
+ max_caption_length: 257
30
+ num_image_tokens: 32
31
+ adapter_type: mlp
32
+ text_model_dtype: bfloat16
33
+ pre_test: false
34
+ train_image_model: true
35
+ image_model_lr: null
36
+ train_lora: true
37
+ lora_r: 64
38
+ lora_alpha: 16
39
+ lora_dropout: 0.1
cgrkzexw-599808/image_adapter.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38db2fe263be2d494a50be4a7bbfd7b23b76f9d03e4008a1b7df97d6b27894ef
3
+ size 86067714
joy CHANGED
@@ -270,7 +270,9 @@ class JoyCaptionModel:
270
  self.clip_model.to("cuda")
271
 
272
  logging.info("Loading tokenizer")
273
- self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
 
 
274
  assert isinstance(
275
  self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
276
  )
@@ -558,7 +560,8 @@ class JoyCaptionModel:
558
  input_ids,
559
  inputs_embeds=inputs_embeds,
560
  attention_mask=attention_mask,
561
- max_new_tokens=512,
 
562
  do_sample=True,
563
  suppress_tokens=None,
564
  repetition_penalty=1.2,
@@ -996,6 +999,7 @@ def prompt_from_tags(
996
  base_prompt_suffix,
997
  tag_string_prefix,
998
  tag_string,
 
999
  ]
1000
  logging.debug("Prompt pieces: %r", pieces)
1001
  custom_prompt = " ".join(p for p in pieces if p)
 
270
  self.clip_model.to("cuda")
271
 
272
  logging.info("Loading tokenizer")
273
+ self.tokenizer = AutoTokenizer.from_pretrained(
274
+ CHECKPOINT_PATH / "text_model", use_fast=True
275
+ )
276
  assert isinstance(
277
  self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
278
  )
 
560
  input_ids,
561
  inputs_embeds=inputs_embeds,
562
  attention_mask=attention_mask,
563
+ max_new_tokens=300,
564
+ # max_length=4096,
565
  do_sample=True,
566
  suppress_tokens=None,
567
  repetition_penalty=1.2,
 
999
  base_prompt_suffix,
1000
  tag_string_prefix,
1001
  tag_string,
1002
+ ".",
1003
  ]
1004
  logging.debug("Prompt pieces: %r", pieces)
1005
  custom_prompt = " ".join(p for p in pieces if p)