import os

from huggingface_hub import hf_hub_download, snapshot_download
import torch

from videogen_hub import MODEL_PATH


class T2VTurbo:
    def __init__(self, base_model="vc2", merged=True, device="cuda"):
        """
        1. Download the pretrained model weights into MODEL_PATH.
        2. Create the inference pipeline.

        Args:
            base_model (str, optional): "vc2" to use VideoCrafter2 as the base model;
                any other value falls back to ModelScope (text-to-video-ms-1.7b). Defaults to "vc2".
            merged (bool, optional): When base_model is "vc2", load the merged T2V-Turbo-VC2
                checkpoint instead of the base checkpoint plus a UNet LoRA. Defaults to True.
            device (str, optional): 'cuda' or 'cpu', the device on which to run the model. Defaults to "cuda".
        """
        from videogen_hub.pipelines.t2v_turbo.inference_vc2 import T2VTurboVC2Pipeline1
        from videogen_hub.pipelines.t2v_turbo.inference_ms import T2VTurboMSPipeline1

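        # VideoCrafter2 LatentDiffusion configuration consumed by the VC2 pipelines below
        # (UNet, autoencoder, and frozen OpenCLIP text-encoder settings).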
        self.config = {
            "model": {
                "target": "lvdm.models.ddpm3d.LatentDiffusion",
                "params": {
                    "linear_start": 0.00085,
                    "linear_end": 0.012,
                    "num_timesteps_cond": 1,
                    "timesteps": 1000,
                    "first_stage_key": "video",
                    "cond_stage_key": "caption",
                    "cond_stage_trainable": False,
                    "conditioning_key": "crossattn",
                    "image_size": [320, 512],
                    "channels": 4,
                    "scale_by_std": False,
                    "scale_factor": 0.18215,
                    "use_ema": False,
                    "uncond_type": "empty_seq",
                    "use_scale": True,
                    "scale_b": 0.7,
                    "unet_config": {
                        "target": "lvdm.modules.networks.openaimodel3d.UNetModel",
                        "params": {
                            "in_channels": 4,
                            "out_channels": 4,
                            "model_channels": 320,
                            "attention_resolutions": [4, 2, 1],
                            "num_res_blocks": 2,
                            "channel_mult": [1, 2, 4, 4],
                            "num_head_channels": 64,
                            "transformer_depth": 1,
                            "context_dim": 1024,
                            "use_linear": True,
                            "use_checkpoint": True,
                            "temporal_conv": True,
                            "temporal_attention": True,
                            "temporal_selfatt_only": True,
                            "use_relative_position": False,
                            "use_causal_attention": False,
                            "temporal_length": 16,
                            "addition_attention": True,
                            "fps_cond": True
                        }
                    },
                    "first_stage_config": {
                        "target": "lvdm.models.autoencoder.AutoencoderKL",
                        "params": {
                            "embed_dim": 4,
                            "monitor": "val / rec_loss",
                            "ddconfig": {
                                "double_z": True,
                                "z_channels": 4,
                                "resolution": 512,
                                "in_channels": 3,
                                "out_ch": 3,
                                "ch": 128,
                                "ch_mult": [1, 2, 4, 4],
                                "num_res_blocks": 2,
                                "attn_resolutions": [],
                                "dropout": 0.0
                            },
                            "lossconfig": {
                                "target": "torch.nn.Identity"
                            }
                        }
                    },
                    "cond_stage_config": {
                        "target": "lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder",
                        "params": {
                            "freeze": True,
                            "layer": "penultimate"
                        }
                    }
                }
            }
        }
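        # Select checkpoints and pipeline according to the requested base model:
        # the merged T2V-Turbo-VC2 checkpoint, the VC2 base checkpoint plus a UNet LoRA,
        # or the ModelScope base model plus a UNet LoRA.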
        if base_model == "vc2" and merged:
            merged_model_path = hf_hub_download(repo_id="jiachenli-ucsb/T2V-Turbo-VC2-Merged",
                                                filename="t2v_turbo_vc2.pt",
                                                local_dir=os.path.join(MODEL_PATH, "T2V-Turbo-VC2"))
            self.pipeline = T2VTurboVC2Pipeline1(self.config, merged, device, None, merged_model_path)

        elif base_model == "vc2":
            base_model_path = hf_hub_download(repo_id="VideoCrafter/VideoCrafter2",
                                              filename="model.ckpt",
                                              local_dir=os.path.join(MODEL_PATH, "videocrafter2"))

            unet_lora_path = hf_hub_download(repo_id="jiachenli-ucsb/T2V-Turbo-VC2",
                                             filename="unet_lora.pt",
                                             local_dir=os.path.join(MODEL_PATH, "T2V-Turbo-VC2"))
            # It uses the config provided above.
            self.pipeline = T2VTurboVC2Pipeline1(self.config, merged, device, unet_lora_path, base_model_path)
        else:
            base_model_path = snapshot_download(repo_id="ali-vilab/text-to-video-ms-1.7b",
                                                local_dir=os.path.join(MODEL_PATH, "modelscope_1.7b"))

            unet_lora_path = hf_hub_download(repo_id="jiachenli-ucsb/T2V-Turbo-MS",
                                             filename="unet_lora.pt",
                                             local_dir=os.path.join(MODEL_PATH, "T2V-Turbo-MS"))

            # It uses the config provided by base_model.
            self.pipeline = T2VTurboMSPipeline1(device, unet_lora_path, base_model_path)

    def infer_one_video(
            self,
            prompt: str = None,
            size: list = [320, 512],
            seconds: int = 2,
            fps: int = 8,
            seed: int = 42,
    ):
        """
    Generates a single video based on the provided prompt and parameters.
    The output is of shape [frames, channels, height, width].
    Args:
        prompt (str, optional): The text prompt to generate the video from. Defaults to None.
        seconds (int, optional): The duration of the video in seconds. Defaults to 2.
        fps (int, optional): The frames per second of the video. Defaults to 8.
        seed (int, optional): The seed for random number generation. Defaults to 42.

    Returns:
        torch.Tensor: The generated video as a tensor.
    """
        output = self.pipeline.inference(prompt=prompt, height=size[0], width=size[1],
                                         seed=seed, num_frames=seconds * fps, fps=fps, randomize_seed=False)
        # [channels, frames, height, width] -> [frames, channels, height, width]
        output = output.squeeze().permute(1, 0, 2, 3)
        return output.cpu()
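

# Minimal usage sketch (illustrative, not part of the original module): it assumes a CUDA
# device and network access to the Hugging Face Hub for the checkpoint downloads above.
if __name__ == "__main__":
    model = T2VTurbo(base_model="vc2", merged=True, device="cuda")
    # 2 seconds at 8 fps -> a tensor of shape [16, channels, 320, 512]
    video = model.infer_one_video(
        prompt="A corgi running on the beach at sunset",
        size=[320, 512],
        seconds=2,
        fps=8,
        seed=42,
    )
    print(video.shape, video.dtype)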