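# Training and inference configuration for the "joyhallo" experiment: an
# audio-driven portrait-animation pipeline built on Stable Diffusion 1.5
# (see the pretrained-model paths near the bottom). Where a comment below
# interprets a field, the reading is inferred from key names and model
# paths rather than taken from upstream documentation.

# Data loading: batch sizes, clip geometry, frame sampling, and audio
# alignment. n_motion_frames and audio_margin presumably control the
# temporal context carried around each sampled window.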
data:
  train_bs: 4
  val_bs: 1
  train_width: 512
  train_height: 512
  fps: 25
  sample_rate: 16000
  n_motion_frames: 2
  n_sample_frames: 16
  audio_margin: 2
  train_meta_paths:
    - "./data/inference.json"

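# Audio feature extraction: a Chinese wav2vec2 encoder, apparently applied
# to the vocal stem isolated by the Kim_Vocal_2 ONNX source separator.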
wav2vec_config:
  audio_type: "vocals" # options: audio, vocals
  model_scale: "base" # options: base, large
  features: "all" # options: last, avg, all
  model_path: ./pretrained_models/chinese-wav2vec2-base
audio_separator:
  model_path: ./pretrained_models/audio_separator/Kim_Vocal_2.onnx
face_expand_ratio: 1.2

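# Optimization settings (Accelerate-style). 8-bit Adam and gradient
# checkpointing trade a little extra compute for lower GPU memory use.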
solver:
  gradient_accumulation_steps: 1
  mixed_precision: "no"
  enable_xformers_memory_efficient_attention: true
  gradient_checkpointing: true
  max_train_steps: 30000
  max_grad_norm: 1.0
  # learning rate
  learning_rate: 1e-5
  scale_lr: false
  lr_warmup_steps: 1
  lr_scheduler: "constant"

  # optimizer
  use_8bit_adam: true
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_weight_decay: 1.0e-2
  adam_epsilon: 1.0e-8

val:
  validation_steps: 1000

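# Diffusion noise schedule: linear betas over 1000 training timesteps.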
noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false

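# UNet additions: AnimateDiff-style temporal motion modules, plus what
# appear to be audio cross-attention modules (audio_attention_dim 768
# matches the wav2vec2-base hidden size) stacked into the down/mid/up blocks.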
unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true
  use_audio_module: true
  motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types:
      - Temporal_Self
      - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
  audio_attention_dim: 768
  stack_enable_blocks_name:
    - "up"
    - "down"
    - "mid"
  stack_enable_blocks_depth: [0, 1, 2, 3]

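# Presumably only these module groups receive gradients; the rest of the
# UNet stays frozen.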
trainable_para:
  - audio_modules
  - motion_modules

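# Pretrained weights: the SD 1.5 backbone, the ft-MSE VAE, a face-analysis
# model (likely for face detection and cropping), and the AnimateDiff v2
# motion-module checkpoint.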
base_model_path: "./pretrained_models/stable-diffusion-v1-5"
vae_model_path: "./pretrained_models/sd-vae-ft-mse"
face_analysis_model_path: "./pretrained_models/face_analysis"
mm_path: "./pretrained_models/motion_module/mm_sd_v15_v2.ckpt"

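# The uncond_* ratios look like classifier-free-guidance dropout: the image,
# audio, or both conditions are dropped for ~5% of training samples.
# noise_offset and snr_gamma match the usual offset-noise and Min-SNR
# loss-weighting tricks.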
weight_dtype: "fp16" # options: fp16, fp32
uncond_img_ratio: 0.05
uncond_audio_ratio: 0.05
uncond_ia_ratio: 0.05
start_ratio: 0.05
noise_offset: 0.05
snr_gamma: 5.0
enable_zero_snr: true
stage1_ckpt_dir: "./exp_output/stage1/"

single_inference_times: 10
inference_steps: 20 # fewer denoising steps to speed up inference
cfg_scale: 3.5

seed: 42
resume_from_checkpoint: "latest"
checkpointing_steps: 500

exp_name: "joyhallo"
output_dir: "./opts"

audio_ckpt_dir: "./pretrained_models/joyhallo/net.pth"

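# Placeholder inputs, presumably supplied or overridden at inference time.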
ref_img_path: None
audio_path: None