Spaces:

heheyas
/

V3D

Running on Zero

App Files Files Community

V3D / scripts /pub /V3D_512.py

heheyas

init

cfb7702 4 months ago

raw

history blame contribute delete

No virus

11.1 kB

	import math
	import os
	from glob import glob
	from pathlib import Path
	from typing import Optional

	import cv2
	import numpy as np
	import torch
	from einops import rearrange, repeat
	from fire import Fire
	import tyro
	from omegaconf import OmegaConf
	from PIL import Image
	from torchvision.transforms import ToTensor
	from mediapy import write_video
	import rembg
	from kiui.op import recenter
	from safetensors.torch import load_file as load_safetensors
	from typing import Any

	from scripts.util.detection.nsfw_and_watermark_dectection import DeepFloydDataFiltering
	from sgm.inference.helpers import embed_watermark
	from sgm.util import default, instantiate_from_config


	def get_unique_embedder_keys_from_conditioner(conditioner):
	return list(set([x.input_key for x in conditioner.embedders]))


	def get_batch(keys, value_dict, N, T, device):
	batch = {}
	batch_uc = {}

	for key in keys:
	if key == "fps_id":
	batch[key] = (
	torch.tensor([value_dict["fps_id"]])
	.to(device)
	.repeat(int(math.prod(N)))
	)
	elif key == "motion_bucket_id":
	batch[key] = (
	torch.tensor([value_dict["motion_bucket_id"]])
	.to(device)
	.repeat(int(math.prod(N)))
	)
	elif key == "cond_aug":
	batch[key] = repeat(
	torch.tensor([value_dict["cond_aug"]]).to(device),
	"1 -> b",
	b=math.prod(N),
	)
	elif key == "cond_frames":
	batch[key] = repeat(value_dict["cond_frames"], "1 ... -> b ...", b=N[0])
	elif key == "cond_frames_without_noise":
	batch[key] = repeat(
	value_dict["cond_frames_without_noise"], "1 ... -> b ...", b=N[0]
	)
	else:
	batch[key] = value_dict[key]

	if T is not None:
	batch["num_video_frames"] = T

	for key in batch.keys():
	if key not in batch_uc and isinstance(batch[key], torch.Tensor):
	batch_uc[key] = torch.clone(batch[key])
	return batch, batch_uc


	def load_model(
	config: str,
	device: str,
	num_frames: int,
	num_steps: int,
	ckpt_path: Optional[str] = None,
	min_cfg: Optional[float] = None,
	max_cfg: Optional[float] = None,
	sigma_max: Optional[float] = None,
	):
	config = OmegaConf.load(config)

	config.model.params.sampler_config.params.num_steps = num_steps
	config.model.params.sampler_config.params.guider_config.params.num_frames = (
	num_frames
	)
	if max_cfg is not None:
	config.model.params.sampler_config.params.guider_config.params.max_scale = (
	max_cfg
	)
	if min_cfg is not None:
	config.model.params.sampler_config.params.guider_config.params.min_scale = (
	min_cfg
	)
	if sigma_max is not None:
	print("Overriding sigma_max to ", sigma_max)
	config.model.params.sampler_config.params.discretization_config.params.sigma_max = (
	sigma_max
	)

	config.model.params.from_scratch = False

	if ckpt_path is not None:
	config.model.params.ckpt_path = str(ckpt_path)
	if device == "cuda":
	with torch.device(device):
	model = instantiate_from_config(config.model).to(device).eval()
	else:
	model = instantiate_from_config(config.model).to(device).eval()

	return model, None


	def sample_one(
	input_path: str = "assets/test_image.png", # Can either be image file or folder with image files
	checkpoint_path: Optional[str] = None,
	num_frames: Optional[int] = None,
	num_steps: Optional[int] = None,
	fps_id: int = 1,
	motion_bucket_id: int = 300,
	cond_aug: float = 0.02,
	seed: int = 23,
	decoding_t: int = 24, # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
	device: str = "cuda",
	output_folder: Optional[str] = None,
	noise: torch.Tensor = None,
	save: bool = False,
	cached_model: Any = None,
	border_ratio: float = 0.3,
	min_guidance_scale: float = 3.5,
	max_guidance_scale: float = 3.5,
	sigma_max: float = None,
	ignore_alpha: bool = False,
	):
	model_config = "scripts/pub/configs/V3D_512.yaml"
	num_frames = OmegaConf.load(
	model_config
	).model.params.sampler_config.params.guider_config.params.num_frames
	print("Detected num_frames:", num_frames)
	num_steps = default(num_steps, 25)
	output_folder = default(output_folder, f"outputs/V3D_512")
	decoding_t = min(decoding_t, num_frames)

	sd = load_safetensors("./ckpts/svd_xt.safetensors")
	clip_model_config = OmegaConf.load("configs/embedder/clip_image.yaml")
	clip_model = instantiate_from_config(clip_model_config).eval()
	clip_sd = dict()
	for k, v in sd.items():
	if "conditioner.embedders.0" in k:
	clip_sd[k.replace("conditioner.embedders.0.", "")] = v
	clip_model.load_state_dict(clip_sd)
	clip_model = clip_model.to(device)

	ae_model_config = OmegaConf.load("configs/ae/video.yaml")
	ae_model = instantiate_from_config(ae_model_config).eval()
	encoder_sd = dict()
	for k, v in sd.items():
	if "first_stage_model" in k:
	encoder_sd[k.replace("first_stage_model.", "")] = v
	ae_model.load_state_dict(encoder_sd)
	ae_model = ae_model.to(device)

	if cached_model is None:
	model, filter = load_model(
	model_config,
	device,
	num_frames,
	num_steps,
	ckpt_path=checkpoint_path,
	min_cfg=min_guidance_scale,
	max_cfg=max_guidance_scale,
	sigma_max=sigma_max,
	)
	else:
	model = cached_model
	torch.manual_seed(seed)

	need_return = True
	path = Path(input_path)
	if path.is_file():
	if any([input_path.endswith(x) for x in ["jpg", "jpeg", "png"]]):
	all_img_paths = [input_path]
	else:
	raise ValueError("Path is not valid image file.")
	elif path.is_dir():
	all_img_paths = sorted(
	[
	f
	for f in path.iterdir()
	if f.is_file() and f.suffix.lower() in [".jpg", ".jpeg", ".png"]
	]
	)
	need_return = False
	if len(all_img_paths) == 0:
	raise ValueError("Folder does not contain any images.")
	else:
	raise ValueError

	for input_path in all_img_paths:
	with Image.open(input_path) as image:
	# if image.mode == "RGBA":
	# image = image.convert("RGB")
	w, h = image.size

	if border_ratio > 0:
	if image.mode != "RGBA" or ignore_alpha:
	image = image.convert("RGB")
	image = np.asarray(image)
	carved_image = rembg.remove(image) # [H, W, 4]
	else:
	image = np.asarray(image)
	carved_image = image
	mask = carved_image[..., -1] > 0
	image = recenter(carved_image, mask, border_ratio=border_ratio)
	image = image.astype(np.float32) / 255.0
	if image.shape[-1] == 4:
	image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])
	image = Image.fromarray((image * 255).astype(np.uint8))
	else:
	print("Ignore border ratio")
	image = image.resize((512, 512))

	image = ToTensor()(image)
	image = image * 2.0 - 1.0

	image = image.unsqueeze(0).to(device)
	H, W = image.shape[2:]
	assert image.shape[1] == 3
	F = 8
	C = 4
	shape = (num_frames, C, H // F, W // F)

	value_dict = {}
	value_dict["motion_bucket_id"] = motion_bucket_id
	value_dict["fps_id"] = fps_id
	value_dict["cond_aug"] = cond_aug
	value_dict["cond_frames_without_noise"] = clip_model(image)
	value_dict["cond_frames"] = ae_model.encode(image)
	value_dict["cond_frames"] += cond_aug * torch.randn_like(
	value_dict["cond_frames"]
	)
	value_dict["cond_aug"] = cond_aug

	with torch.no_grad():
	with torch.autocast(device):
	batch, batch_uc = get_batch(
	get_unique_embedder_keys_from_conditioner(model.conditioner),
	value_dict,
	[1, num_frames],
	T=num_frames,
	device=device,
	)
	c, uc = model.conditioner.get_unconditional_conditioning(
	batch,
	batch_uc=batch_uc,
	force_uc_zero_embeddings=[
	"cond_frames",
	"cond_frames_without_noise",
	],
	)

	for k in ["crossattn", "concat"]:
	uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
	uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
	c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
	c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)

	randn = torch.randn(shape, device=device) if noise is None else noise
	randn = randn.to(device)

	additional_model_inputs = {}
	additional_model_inputs["image_only_indicator"] = torch.zeros(
	2, num_frames
	).to(device)
	additional_model_inputs["num_video_frames"] = batch["num_video_frames"]

	def denoiser(input, sigma, c):
	return model.denoiser(
	model.model, input, sigma, c, **additional_model_inputs
	)

	samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
	model.en_and_decode_n_samples_a_time = decoding_t
	samples_x = model.decode_first_stage(samples_z)
	samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)

	os.makedirs(output_folder, exist_ok=True)
	base_count = len(glob(os.path.join(output_folder, "*.mp4")))
	video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
	# writer = cv2.VideoWriter(
	# video_path,
	# cv2.VideoWriter_fourcc(*"MP4V"),
	# fps_id + 1,
	# (samples.shape[-1], samples.shape[-2]),
	# )

	frames = (
	(rearrange(samples, "t c h w -> t h w c") * 255)
	.cpu()
	.numpy()
	.astype(np.uint8)
	)

	if save:
	write_video(video_path, frames, fps=3)

	images = []
	for frame in frames:
	images.append(Image.fromarray(frame))

	if need_return:
	return images, model


	if __name__ == "__main__":
	tyro.cli(sample_one)