"""
This script demonstrates how to encode and decode (reconstruct) video frames using a pre-trained CogVideoX VAE with 🤗 Huggingface Diffusers.

Note:
    This script requires the `diffusers>=0.30.0` library to be installed.
    If the output video appears completely green and cannot be viewed, please switch to a different player; this is normal behavior, not an encoding failure.
    Encoding a 6-second video at 720p resolution takes roughly 71GB of GPU memory.

Run the script:
    $ python cli_vae_demo.py --model_path THUDM/CogVideoX-2b --video_path path/to/video.mp4 --output_path path/to/output
"""
import argparse
import os

import imageio
import numpy as np
import torch
from diffusers import AutoencoderKLCogVideoX
from torchvision import transforms


def vae_demo(model_path, video_path, dtype, device):
    """
    Loads a pre-trained AutoencoderKLCogVideoX model and reconstructs the video frames (encode followed by decode).

    Parameters:
    - model_path (str): The path to the pre-trained model.
    - video_path (str): The path to the video file.
    - dtype (torch.dtype): The data type for computation.
    - device (str): The device to use for computation (e.g., "cuda" or "cpu").

    Returns:
    - DecoderOutput: The model output; its ``.sample`` tensor holds the reconstructed frames.
    """
    # Load the pre-trained VAE
    model = AutoencoderKLCogVideoX.from_pretrained(model_path, torch_dtype=dtype).to(device)
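
    # Optional memory savers (an assumption: CogVideoX's VAE exposes the usual
    # AutoencoderKL tiling/slicing helpers in diffusers>=0.30; uncomment to
    # trade speed for a far lower peak memory than the ~71GB noted above):
    # model.enable_slicing()
    # model.enable_tiling()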
    # Load video frames
    video_reader = imageio.get_reader(video_path, "ffmpeg")
    frames = [frame for frame in video_reader]
    video_reader.close()

    # Transform frames to tensors in [0, 1], stacked to [T, C, H, W]
    transform = transforms.Compose(
        [
            transforms.ToTensor(),
        ]
    )
    frames_tensor = torch.stack([transform(frame) for frame in frames])

    # Reorder to [C, T, H, W] and add a batch dimension -> [1, C, T, H, W]
    # (e.g., [1, 3, 49, 480, 720] for a 49-frame 480x720 clip), then map
    # [0, 1] to [-1, 1], the usual diffusers VAE input range
    frames_tensor = frames_tensor.permute(1, 0, 2, 3).unsqueeze(0)
    frames_tensor = (frames_tensor * 2.0 - 1.0).to(dtype).to(device)

    # Run the model end to end: encode to latents, then decode back to frames
    with torch.no_grad():
        output = model(frames_tensor)
    return output
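
# A minimal encode-only sketch, assuming the diffusers AutoencoderKL convention:
# `model.encode(...)` returns an output with a `latent_dist` posterior and the
# model config exposes a `scaling_factor` for the latents. Useful when only the
# latents are needed (e.g., as diffusion model inputs) rather than a full
# reconstruction; `frames_tensor` is the same [1, C, T, H, W] input as above.
def encode_video(model, frames_tensor):
    with torch.no_grad():
        # Sample from the posterior produced by the encoder and scale
        latent_dist = model.encode(frames_tensor).latent_dist
        latents = latent_dist.sample() * model.config.scaling_factor
    return latents
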
def save_video(tensor, output_path):
    """
    Saves the reconstructed video frames to a video file.

    Parameters:
    - tensor (DecoderOutput): The model output; ``.sample`` holds the frames.
    - output_path (str): The directory in which to save the output video.
    """
    # Drop the batch dimension and permute [C, T, H, W] -> [T, H, W, C]
    frames = tensor.sample[0].permute(1, 2, 3, 0).float().cpu().numpy()

    # Map [-1, 1] back to [0, 1], clip, and convert to uint8
    frames = np.clip(frames / 2 + 0.5, 0, 1)
    frames = (frames * 255).astype(np.uint8)

    # Save frames to video (fps is fixed at 30 here)
    writer = imageio.get_writer(os.path.join(output_path, "output.mp4"), fps=30)
    for frame in frames:
        writer.append_data(frame)
    writer.close()
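
# A small helper sketch for preserving the source frame rate instead of the
# fixed fps=30 above (an assumption: imageio's ffmpeg reader reports "fps" in
# its metadata dictionary, which holds for typical mp4 inputs); pass its result
# as the `fps` argument when creating the writer:
def source_fps(video_path, default=30):
    reader = imageio.get_reader(video_path, "ffmpeg")
    fps = reader.get_meta_data().get("fps", default)
    reader.close()
    return fps
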
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Encode and decode a video with the CogVideoX VAE")
    parser.add_argument("--model_path", type=str, required=True, help="The path to the CogVideoX model")
    parser.add_argument("--video_path", type=str, required=True, help="The path to the video file")
    parser.add_argument("--output_path", type=str, default="./", help="The directory in which to save the output video")
    parser.add_argument(
        "--dtype", type=str, default="float16", help="The data type for computation (e.g., 'float16' or 'float32')"
    )
    parser.add_argument(
        "--device", type=str, default="cuda", help="The device to use for computation (e.g., 'cuda' or 'cpu')"
    )
    args = parser.parse_args()

    # Set device and dtype
    device = torch.device(args.device)
    dtype = torch.float16 if args.dtype == "float16" else torch.float32

    output = vae_demo(args.model_path, args.video_path, dtype, device)
    save_video(output, args.output_path)