# File size: 5,366 Bytes
# b3f324b
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Sample new videos from a pre-trained Latte model and save them as .mp4 files.
"""
import os
import sys
from accelerate import Accelerator
from tqdm import tqdm
from opensora.dataset import ae_denorm
from opensora.models.ae import ae_channel_config, getae, ae_stride_config
from opensora.models.ae.videobase import CausalVQVAEModelWrapper
from opensora.models.diffusion import Diffusion_models
from opensora.models.diffusion.diffusion import create_diffusion_T as create_diffusion
from opensora.models.diffusion.latte.modeling_latte import Latte
from opensora.utils.utils import find_model
import torch
import argparse
from einops import rearrange
import imageio
# Enable TF32 for CUDA matmuls and for cuDNN ops via PyTorch's backend flags.
# NOTE(review): per the PyTorch docs TF32 trades some precision for speed on
# GPUs that support it — confirm this is acceptable for sampling quality.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
def main(args):
    """Sample videos from a pre-trained Latte checkpoint and write them as .mp4 files.

    Args:
        args: Parsed command-line namespace. This function reads ``ckpt``,
            ``ae``, ``image_size``, ``num_frames``, ``num_sample``,
            ``num_sampling_steps``, ``cfg_scale``, ``train_classcondition``,
            ``num_classes``, ``sample_method``, ``mixed_precision``,
            ``save_video_path`` and ``fps``. As a side effect it sets
            ``args.latent_size`` before passing ``args`` to ``getae``.

    Raises:
        ValueError: If ``args.sample_method`` is neither ``'ddim'`` nor ``'ddpm'``.
        AssertionError: If no CUDA device is available.
    """
    # Fail fast on an unsupported sampler: previously an unknown value was only
    # noticed after the model had been loaded, as a NameError on `samples`.
    if args.sample_method not in ('ddim', 'ddpm'):
        raise ValueError(
            f"Unsupported sample_method {args.sample_method!r}; expected 'ddim' or 'ddpm'."
        )

    # Setup PyTorch:
    # NOTE: no random seed is set here, so outputs differ from run to run.
    torch.set_grad_enabled(False)
    assert torch.cuda.is_available(), "Sampling currently requires at least one GPU."

    # Setup accelerator:
    accelerator = Accelerator(mixed_precision=args.mixed_precision)
    device = accelerator.device
    using_cfg = args.cfg_scale > 1.0

    # Load model:
    # Index 0 of the stride config divides the frame count; indices 1 and 2
    # divide the image size to give the two latent spatial dimensions.
    stride = ae_stride_config[args.ae]
    latent_size = (args.image_size // stride[1], args.image_size // stride[2])
    args.latent_size = latent_size
    model = Latte.from_pretrained(args.ckpt, subfolder="model").to(device)
    model.eval()  # important!
    model = accelerator.prepare(model)
    # `prepare` only wraps the model (exposing `.module`) in distributed runs;
    # unwrap_model returns the underlying module in both the wrapped and the
    # unwrapped case, so attribute access works on a single process too.
    base_model = accelerator.unwrap_model(model)

    diffusion = create_diffusion(str(args.num_sampling_steps))
    ae = getae(args).to(device)

    video_length = args.num_frames // stride[0]
    if isinstance(ae, CausalVQVAEModelWrapper):
        # The causal VQ-VAE wrapper gets one extra latent frame.
        # NOTE(review): presumably because the first frame is encoded on its
        # own — confirm against the autoencoder implementation.
        video_length += 1

    if args.sample_method == 'ddim':
        sample_loop = diffusion.ddim_sample_loop
    else:
        sample_loop = diffusion.p_sample_loop

    # Create the output directory once, up front; exist_ok avoids the
    # check-then-create race of a separate os.path.exists test.
    os.makedirs(args.save_video_path, exist_ok=True)

    for i in tqdm(range(args.num_sample)):
        # Create sampling noise:
        z = torch.randn(
            1, base_model.in_channels, video_length, latent_size[0], latent_size[1], device=device
        )

        # Draw a random class label when the model is class-conditional.
        if args.train_classcondition:
            y = torch.randint(0, args.num_classes, (1,), device=device)
            cls_id = str(y.item())
        else:
            y = None
            cls_id = None

        # Setup classifier-free guidance:
        if using_cfg and args.train_classcondition:
            # Duplicate the noise and pair the second copy with label index
            # `num_classes` (presumably the "null"/unconditional class).
            z = torch.cat([z, z], 0)
            y_null = torch.tensor([args.num_classes], device=device)
            y = torch.cat([y, y_null], dim=0)
            model_kwargs = dict(class_labels=y, cfg_scale=args.cfg_scale)
            sample_fn = base_model.forward_with_cfg
        else:
            model_kwargs = dict(class_labels=y)
            sample_fn = model.forward

        # Sample latents, then decode them with the autoencoder. Gradients are
        # already disabled globally above, so no extra no_grad context is needed.
        samples = sample_loop(
            sample_fn, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device
        )
        samples = ae.decode(samples)

        # Save and display images:
        # Only the first batch element is kept (with CFG the second is the
        # null-label copy). Denormalise, scale to [0, 255] with rounding, and
        # move channels last for the video writer.
        # NOTE(review): assumes samples[0] is (T, C, H, W) — TODO confirm.
        video_ = (
            (ae_denorm[args.ae](samples[0]) * 255)
            .add_(0.5)
            .clamp_(0, 255)
            .to(dtype=torch.uint8)
            .cpu()
            .permute(0, 2, 3, 1)
            .contiguous()
        )
        if args.train_classcondition:
            file_name = f"sample_{i:03d}_cls{cls_id}.mp4"
        else:
            file_name = f"sample_{i:03d}.mp4"
        video_save_path = os.path.join(args.save_video_path, file_name)
        print(video_save_path)
        imageio.mimwrite(video_save_path, video_, fps=args.fps, quality=9)

    print('save path {}'.format(args.save_video_path))
# Command-line entry point: parse the sampling options and run main().
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Checkpoint / model selection.
    parser.add_argument("--ckpt", type=str, default="")
    # NOTE(review): --model and --attention_mode are not read by main() directly;
    # they may be consumed by helpers that receive `args` — confirm before removing.
    parser.add_argument("--model", type=str, default='Latte-XL/122')
    parser.add_argument("--ae", type=str, default='stabilityai/sd-vae-ft-mse')

    # Output options.
    parser.add_argument("--save_video_path", type=str, default="./sample_videos/")
    parser.add_argument("--fps", type=int, default=10)

    # Video shape and class conditioning.
    parser.add_argument("--num_classes", type=int, default=101)
    parser.add_argument("--num_frames", type=int, default=16)
    parser.add_argument("--image_size", type=int, default=256)
    parser.add_argument("--train_classcondition", action="store_true")

    # Sampling options.
    parser.add_argument("--num_sampling_steps", type=int, default=250)
    parser.add_argument("--num_sample", type=int, default=1)
    parser.add_argument("--cfg_scale", type=float, default=1.0)
    # Restrict to the two samplers main() implements so a typo is rejected at
    # parse time instead of failing after the model has been loaded.
    parser.add_argument("--sample_method", type=str, default='ddpm', choices=['ddpm', 'ddim'])
    parser.add_argument("--mixed_precision", type=str, default=None, choices=[None, "fp16", "bf16"])
    parser.add_argument("--attention_mode", type=str, choices=['xformers', 'math', 'flash'], default="math")

    args = parser.parse_args()
    main(args)
|