Spaces:

maxin-cn
/

Latte-1

Running on Zero

App Files Files Community

Latte-1 / sample /sample_ddp.py

maxin-cn

Upload folder using huggingface_hub

94bafa8 verified 2 months ago

raw

history blame

No virus

8.33 kB

	# All rights reserved.

	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.

	"""
	Samples a large number of videos from a pre-trained Latte model using DDP.
	Each sample is written to disk as an .mp4 file. The follow-up step that packs
	samples into a single .npz file (for computing FVD and other evaluation
	metrics via the ADM repo: https://github.com/openai/guided-diffusion/tree/main/evaluations)
	is currently disabled in main().

	For a simple single-GPU/CPU sampling script, see sample.py.
	"""
	import io
	import os
	import sys
	import torch
	sys.path.append(os.path.split(sys.path[0])[0])
	import torch.distributed as dist
	from utils import find_model
	from diffusion import create_diffusion
	from diffusers.models import AutoencoderKL
	from tqdm import tqdm
	import os
	from PIL import Image
	import numpy as np
	import math
	import argparse
	import imageio
	from omegaconf import OmegaConf
	from models import get_models
	from einops import rearrange


	def create_npz_from_sample_folder(sample_dir, num=50_000):
	    """
	    Pack the first ``num`` .png samples of a folder into one .npz archive.

	    Files are read as ``{sample_dir}/000000.png``, ``{sample_dir}/000001.png``, ...
	    and stacked along a new leading axis. The archive is written next to the
	    folder as ``{sample_dir}.npz`` under the key ``arr_0``.

	    Args:
	        sample_dir: Folder containing the zero-padded, six-digit .png files.
	        num: Number of consecutive samples (starting at index 0) to pack.

	    Returns:
	        The path of the written .npz file.
	    """
	    # Load every image as a uint8 array; tqdm only adds a progress bar.
	    frames = [
	        np.asarray(Image.open(f"{sample_dir}/{idx:06d}.png")).astype(np.uint8)
	        for idx in tqdm(range(num), desc="Building .npz file from samples")
	    ]
	    stacked = np.stack(frames)

	    # Sanity check: one entry per sample, three channels in the last axis.
	    expected_shape = (num, stacked.shape[1], stacked.shape[2], 3)
	    assert stacked.shape == expected_shape

	    out_path = f"{sample_dir}.npz"
	    np.savez(out_path, arr_0=stacked)
	    print(f"Saved .npz file to {out_path} [shape={stacked.shape}].")
	    return out_path


	def main(args):
	    """
	    Run distributed (DDP) sampling and save every generated clip as an .mp4 file.

	    Each rank draws ``args.per_proc_batch_size`` latent clips per iteration,
	    denoises them with the configured sampler, decodes them with the VAE and
	    writes them into the output folder. File indices are interleaved across
	    ranks (``i * world_size + rank + total``) so ranks never write the same file.

	    Args:
	        args: Configuration object (an OmegaConf config when launched as a
	            script). Fields read here: ``seed``, ``ckpt``, ``model``,
	            ``image_size``, ``num_classes``, ``use_compile``,
	            ``num_sampling_steps``, ``pretrained_model_path``, ``use_fp16``,
	            ``cfg_scale``, ``save_video_path``, ``per_proc_batch_size``,
	            ``num_fvd_samples``, ``num_frames`` and ``sample_method``.
	            ``latent_size`` is written back onto it before the model is built.

	    Raises:
	        ValueError: If ``args.sample_method`` is neither ``'ddim'`` nor ``'ddpm'``.
	        AssertionError: If CUDA is unavailable, ``cfg_scale`` is below 1.0, or
	            one of the sample-count divisibility checks fails.
	    """
	    # Fail fast on an unknown sampler. Previously this only surfaced inside the
	    # sampling loop as an unbound `samples` variable, after all models were loaded.
	    if args.sample_method not in ('ddim', 'ddpm'):
	        raise ValueError(
	            f"Unsupported sample_method {args.sample_method!r}; expected 'ddim' or 'ddpm'."
	        )

	    torch.backends.cuda.matmul.allow_tf32 = True # True: fast but may lead to some small numerical differences
	    assert torch.cuda.is_available(), "Sampling with DDP requires at least one GPU. sample.py supports CPU-only usage"
	    torch.set_grad_enabled(False)

	    # Setup DDP:
	    dist.init_process_group("nccl")
	    rank = dist.get_rank()
	    world_size = dist.get_world_size()
	    device = rank % torch.cuda.device_count()
	    # `is not None` (rather than truthiness) so that an explicit seed of 0 is honoured.
	    use_seed = args.seed is not None
	    if use_seed:
	        # Give every rank its own, reproducible random stream.
	        torch.manual_seed(args.seed * world_size + rank)
	    torch.cuda.set_device(device)

	    if args.ckpt is None:
	        assert args.model == "Latte-XL/2", "Only Latte-XL/2 models are available for auto-download."
	        assert args.image_size in [256, 512]
	        assert args.num_classes == 1000

	    # Load model:
	    latent_size = args.image_size // 8
	    args.latent_size = latent_size
	    model = get_models(args).to(device)

	    # Load a pre-trained model or a custom Latte checkpoint from train.py:
	    state_dict = find_model(args.ckpt)
	    model.load_state_dict(state_dict)
	    model.eval() # important!
	    diffusion = create_diffusion(str(args.num_sampling_steps))
	    vae = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder="sd-vae-ft-ema").to(device)

	    if args.use_fp16:
	        print('WARNING: using half percision for inferencing!')
	        vae.to(dtype=torch.float16)
	        model.to(dtype=torch.float16)

	    # Compile only after the weights are loaded: the module returned by
	    # torch.compile exposes its parameters under an `_orig_mod.` prefix, so
	    # loading a plain checkpoint into it would fail on key mismatch.
	    if args.use_compile:
	        model = torch.compile(model)

	    assert args.cfg_scale >= 1.0, "In almost all cases, cfg_scale be >= 1.0"
	    using_cfg = args.cfg_scale > 1.0

	    # Create folder to save samples:
	    sample_folder_dir = args.save_video_path
	    if use_seed:
	        sample_folder_dir = args.save_video_path + '-seed-' + str(args.seed)
	    if rank == 0:
	        os.makedirs(sample_folder_dir, exist_ok=True)
	        print(f"Saving .mp4 samples at {sample_folder_dir}")
	    # Other ranks wait here until rank 0 has created the folder.
	    dist.barrier()

	    # Figure out how many samples we need to generate on each GPU and how many iterations we need to run:
	    n = args.per_proc_batch_size
	    global_batch_size = n * world_size
	    # To make things evenly divisible, the requested count is rounded up to a
	    # multiple of the global batch size (the surplus samples are written too).
	    total_samples = int(math.ceil(args.num_fvd_samples / global_batch_size) * global_batch_size)
	    if rank == 0:
	        print(f"Total number of images that will be sampled: {total_samples}")
	    assert total_samples % world_size == 0, "total_samples must be divisible by world_size"
	    samples_needed_this_gpu = int(total_samples // world_size)
	    assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size"
	    iterations = int(samples_needed_this_gpu // n)
	    pbar = range(iterations)
	    pbar = tqdm(pbar) if rank == 0 else pbar

	    # Loop-invariant choices, resolved once (sample_method was validated above).
	    sample_loop = diffusion.ddim_sample_loop if args.sample_method == 'ddim' else diffusion.p_sample_loop
	    noise_dtype = torch.float16 if args.use_fp16 else None  # None -> torch default dtype

	    total = 0
	    for _ in pbar:
	        # Sample inputs:
	        z = torch.randn(n, args.num_frames, 4, latent_size, latent_size, dtype=noise_dtype, device=device)

	        # Setup classifier-free guidance:
	        if using_cfg:
	            z = torch.cat([z, z], 0)
	            y = torch.randint(0, args.num_classes, (n,), device=device)
	            # NOTE(review): 101 is hard-coded as the "null" label; presumably it
	            # should track args.num_classes - confirm against the model's label embedding.
	            y_null = torch.tensor([101] * n, device=device)
	            y = torch.cat([y, y_null], dim=0)
	            model_kwargs = dict(y=y, cfg_scale=args.cfg_scale, use_fp16=args.use_fp16)
	            sample_fn = model.forward_with_cfg
	        else:
	            model_kwargs = dict(y=None, use_fp16=args.use_fp16)
	            sample_fn = model.forward

	        # Run the reverse diffusion process:
	        samples = sample_loop(
	            sample_fn, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=False, device=device
	        )

	        if using_cfg:
	            samples, _ = samples.chunk(2, dim=0) # Remove null class samples

	        if args.use_fp16:
	            samples = samples.to(dtype=torch.float16)

	        # The VAE decodes single frames, so fold the frame axis into the batch
	        # axis for decoding and unfold it again afterwards.
	        b = samples.shape[0]
	        samples = rearrange(samples, 'b f c h w -> (b f) c h w')
	        samples = vae.decode(samples / 0.18215).sample
	        samples = rearrange(samples, '(b f) c h w -> b f c h w', b=b)

	        # Save each clip to disk as an individual .mp4 file
	        for i, sample in enumerate(samples):
	            # Map [-1, 1] to [0, 255] with rounding, then move channels last per frame.
	            frames = ((sample * 0.5 + 0.5) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1).contiguous()
	            index = i * world_size + rank + total
	            sample_save_path = f"{sample_folder_dir}/{index:04d}.mp4"
	            imageio.mimwrite(sample_save_path, frames, fps=8, quality=9)
	        total += global_batch_size

	    # Make sure all processes have finished saving their samples before tearing down.
	    # The .npz conversion is intentionally not invoked here: this script writes
	    # .mp4 files, whereas create_npz_from_sample_folder reads .png files.
	    dist.barrier()
	    dist.destroy_process_group()


	if __name__ == "__main__":
	    # Command-line options; everything else comes from the YAML config file.
	    cli_parser = argparse.ArgumentParser()
	    cli_parser.add_argument("--config", type=str, default="./configs/tuneavideo.yaml")
	    cli_parser.add_argument("--ckpt", type=str, default="")
	    cli_parser.add_argument("--save_video_path", type=str, default="./sample_videos/")
	    cli_parser.add_argument("--save_ceph", default=False, action='store_true')
	    args = cli_parser.parse_args()

	    # Load the config and let the command-line values override its entries.
	    omega_conf = OmegaConf.load(args.config)
	    for option in ("ckpt", "save_video_path", "save_ceph"):
	        setattr(omega_conf, option, getattr(args, option))

	    main(omega_conf)