k4d3 commited on
Commit
3f4436c
1 Parent(s): 913d039

we just need to fix something

Browse files

Signed-off-by: Balazs Horvath <acsipont@gmail.com>

.zshrc CHANGED
@@ -912,33 +912,65 @@ import safetensors, json
912
  filePath = '$filePath'
913
  print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
914
  }
 
915
  # Function: png2mp4
916
  # Description:
917
  # This function converts a series of PNG images into an MP4 video file using ffmpeg.
918
  #
919
  # Usage:
920
- # png2mp4
921
  #
922
  # Parameters:
923
- # None (uses all PNG files in the current directory)
924
  #
925
  # Actions:
926
  # 1. Sets the frame rate to 8 fps
927
  # 2. Uses glob pattern to include all PNG files in the current directory
928
- # 3. Scales the output video to 1024x1024 resolution
929
- # 4. Sets the Constant Rate Factor (CRF) to 28 for good compression
930
- # 5. Uses the libx264 codec for H.264 encoding
931
- # 6. Sets the pixel format to yuv420p for compatibility
932
- # 7. Outputs the result as 'out.mp4' in the current directory
 
933
  #
934
  # Notes:
935
  # - Requires ffmpeg to be installed and accessible in the system path
936
  # - PNG files should be in the current directory
937
  # - Output video will be named 'out.mp4' and placed in the current directory
938
  # - Adjust the framerate, scale, or CRF value as needed for different results
 
939
  png2mp4() {
940
- ffmpeg -framerate 8 -pattern_type glob -i '*.png' -vf scale=1024x1024 -crf 28 \
941
- -c:v libx264 -pix_fmt yuv420p out.mp4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
942
  }
943
 
944
  # Function: c
 
912
  filePath = '$filePath'
913
  print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
914
  }
915
+
916
  # Function: png2mp4
917
  # Description:
918
  # This function converts a series of PNG images into an MP4 video file using ffmpeg.
919
  #
920
  # Usage:
921
+ # png2mp4 [--max <n>] [--step <multiplier>]
922
  #
923
  # Parameters:
924
+ # --max <n> (optional): keeps only every n-th input image (ffmpeg select filter).
+ # --step <multiplier> (optional): multiplier for the "Steps" counter overlaid on each frame.
925
  #
926
  # Actions:
927
  # 1. Sets the frame rate to 8 fps
928
  # 2. Uses glob pattern to include all PNG files in the current directory
929
+ # 3. Optionally samples every n-th input image when --max is given
930
+ # 4. Scales the output video to 1024x1024 resolution
931
+ # 5. Sets the Constant Rate Factor (CRF) to 28 for good compression
932
+ # 6. Uses the libx264 codec for H.264 encoding
933
+ # 7. Sets the pixel format to yuv420p for compatibility
934
+ # 8. Outputs the result as 'out.mp4' in the current directory
935
  #
936
  # Notes:
937
  # - Requires ffmpeg to be installed and accessible in the system path
938
  # - PNG files should be in the current directory
939
  # - Output video will be named 'out.mp4' and placed in the current directory
940
  # - Adjust the framerate, scale, or CRF value as needed for different results
941
+ # - Use the --max argument to subsample the input (keep every n-th image)
942
  png2mp4() {
943
+ conda deactivate
944
+ local max_images=""
945
+ local step_multiplier=1
946
+ local prefix=""
947
+ while [[ "$#" -gt 0 ]]; do
948
+ case $1 in
949
+ --max) max_images="$2"; shift ;;
950
+ --step) step_multiplier="$2"; shift ;;
951
+ *) echo "Unknown parameter passed: $1"; return 1 ;;
952
+ esac
953
+ shift
954
+ done
955
+
956
+ # Extract the prefix from the first image filename
957
+ prefix=$(/usr/bin/env ls *.png | head -n 1 | cut -d'-' -f1 | tr -d '[:cntrl:]' | tr -d '[]')
958
+
959
+ if [[ -n "$max_images" ]]; then
960
+ ffmpeg -framerate 4 -pattern_type glob -i '*.png' -vf "scale=1024x1024,select='not(mod(n\,$max_images))',drawtext=fontfile=/usr/share/fonts/TTF/Inconsolata-Light.ttf:text='Steps\: %{eif\:n*$step_multiplier}':x=10:y=h-th-10:fontsize=24:fontcolor=white" -crf 28 \
961
+ -c:v libx264 -pix_fmt yuv420p -y temp.mp4
962
+ else
963
+ ffmpeg -framerate 4 -pattern_type glob -i '*.png' -vf "scale=1024x1024,drawtext=fontfile=/usr/share/fonts/TTF/Inconsolata-Light.ttf:text='Steps\: %{eif\:n*$step_multiplier}':x=10:y=h-th-10:fontsize=24:fontcolor=white" -crf 28 \
964
+ -c:v libx264 -pix_fmt yuv420p -y temp.mp4
965
+ fi
966
+
967
+ # Add 3 seconds of the last frame, then 5 seconds delay, and then fade out smoothly for 5 seconds
968
+ ffmpeg -i temp.mp4 -vf "tpad=stop_mode=clone:stop_duration=8,fade=t=out:st=$(ffmpeg -i temp.mp4 2>&1 | grep 'Duration' | awk '{print $2}' | tr -d , | awk -F: '{print ($1 * 3600) + ($2 * 60) + $3 + 5}'):d=5" -c:v libx264 -pix_fmt yuv420p -y "${prefix}.mp4"
969
+
970
+ # Clean up temporary file
971
+ rm temp.mp4
972
+
973
+ conda activate
974
  }
975
 
976
  # Function: c
cgrkzexw-599808/clip_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9277e041aab3e7f20a8e6ecf7248b663aac1c281daf4472c12a6e5013cf9f0cc
3
+ size 1713067838
cgrkzexw-599808/config.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_project: joy-caption-1
2
+ device_batch_size: 2
3
+ batch_size: 256
4
+ learning_rate: 0.0002
5
+ warmup_samples: 18000
6
+ max_samples: 600000
7
+ save_every: 50000
8
+ test_every: 50000
9
+ use_amp: true
10
+ grad_scaler: true
11
+ lr_scheduler_type: cosine
12
+ min_lr_ratio: 0.0
13
+ allow_tf32: true
14
+ seed: 69
15
+ num_workers: 8
16
+ optimizer_type: adamw
17
+ adam_beta1: 0.9
18
+ adam_beta2: 0.999
19
+ adam_eps: 1.0e-08
20
+ adam_weight_decay: 0.0
21
+ clip_grad_norm: 1.0
22
+ dataset: fancyfeast/joy-captioning-20240924a
23
+ clip_model: google/siglip-so400m-patch14-384
24
+ text_model: ../lora-train/lora_model_vwbzycxh
25
+ resume: null
26
+ gradient_checkpointing: false
27
+ test_size: 2048
28
+ grad_scaler_init: 65536.0
29
+ max_caption_length: 257
30
+ num_image_tokens: 32
31
+ adapter_type: mlp
32
+ text_model_dtype: bfloat16
33
+ pre_test: false
34
+ train_image_model: true
35
+ image_model_lr: null
36
+ train_lora: true
37
+ lora_r: 64
38
+ lora_alpha: 16
39
+ lora_dropout: 0.1
cgrkzexw-599808/image_adapter.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38db2fe263be2d494a50be4a7bbfd7b23b76f9d03e4008a1b7df97d6b27894ef
3
+ size 86067714
joy CHANGED
@@ -270,7 +270,9 @@ class JoyCaptionModel:
270
  self.clip_model.to("cuda")
271
 
272
  logging.info("Loading tokenizer")
273
- self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
 
 
274
  assert isinstance(
275
  self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
276
  )
@@ -558,7 +560,8 @@ class JoyCaptionModel:
558
  input_ids,
559
  inputs_embeds=inputs_embeds,
560
  attention_mask=attention_mask,
561
- max_new_tokens=512,
 
562
  do_sample=True,
563
  suppress_tokens=None,
564
  repetition_penalty=1.2,
@@ -996,6 +999,7 @@ def prompt_from_tags(
996
  base_prompt_suffix,
997
  tag_string_prefix,
998
  tag_string,
 
999
  ]
1000
  logging.debug("Prompt pieces: %r", pieces)
1001
  custom_prompt = " ".join(p for p in pieces if p)
 
270
  self.clip_model.to("cuda")
271
 
272
  logging.info("Loading tokenizer")
273
+ self.tokenizer = AutoTokenizer.from_pretrained(
274
+ CHECKPOINT_PATH / "text_model", use_fast=True
275
+ )
276
  assert isinstance(
277
  self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
278
  )
 
560
  input_ids,
561
  inputs_embeds=inputs_embeds,
562
  attention_mask=attention_mask,
563
+ max_new_tokens=300,
564
+ # max_length=4096,
565
  do_sample=True,
566
  suppress_tokens=None,
567
  repetition_penalty=1.2,
 
999
  base_prompt_suffix,
1000
  tag_string_prefix,
1001
  tag_string,
1002
+ ".",
1003
  ]
1004
  logging.debug("Prompt pieces: %r", pieces)
1005
  custom_prompt = " ".join(p for p in pieces if p)