File size: 4,684 Bytes
cfb7702
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
model:
  base_learning_rate: 1.0e-04
  target: sgm.models.video_diffusion.DiffusionEngine
  params:
    ckpt_path: ckpts/V3D_512.ckpt
    scale_factor: 0.18215
    disable_first_stage_autocast: true
    input_key: latents
    log_keys: []
    scheduler_config:
      target: sgm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
        - 1
        cycle_lengths:
        - 10000000000000
        f_start:
        - 1.0e-06
        f_max:
        - 1.0
        f_min:
        - 1.0
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: true
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions:
        - 4
        - 2
        - 1
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 4
        - 4
        num_head_channels: 64
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: true
        use_spatial_context: true
        merge_strategy: learned_with_images
        video_kernel_size:
        - 3
        - 1
        - 1
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        - is_trainable: false
          ucg_rate: 0.2
          input_key: cond_frames_without_noise
          target: sgm.modules.encoders.modules.IdentityEncoder
        - input_key: fps_id
          is_trainable: true
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
        - input_key: motion_bucket_id
          is_trainable: true
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
        - input_key: cond_frames
          is_trainable: false
          ucg_rate: 0.2
          target: sgm.modules.encoders.modules.IdentityEncoder
        - input_key: cond_aug
          is_trainable: true
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
    first_stage_config:
      target: sgm.models.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
        encoder_config:
          target: sgm.modules.diffusionmodules.model.Encoder
          params:
            attn_type: vanilla
            double_z: true
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult:
            - 1
            - 2
            - 4
            - 4
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
        decoder_config:
          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
          params:
            attn_type: vanilla
            double_z: true
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult:
            - 1
            - 2
            - 4
            - 4
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
            video_kernel_size:
            - 3
            - 1
            - 1
    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 30
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 700.0
        guider_config:
          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
          params:
            max_scale: 3.5
            min_scale: 3.5
            num_frames: 18
    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        batch2model_keys:
        - num_video_frames
        - image_only_indicator
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
          params:
            sigma_data: 1.0
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
          params:
            p_mean: 1.5
            p_std: 2.0