"""
This script demonstrates how to encode and decode (reconstruct) video frames using a pre-trained CogVideoX VAE with 🤗 Huggingface Diffusers.

Note:
    This script requires the `diffusers>=0.30.0` library to be installed.
    If the output video appears completely green and cannot be viewed, please switch to a different player; this is normal behavior, not an encoding failure.
    Encoding a 6-second video at 720p resolution takes roughly 71GB of GPU memory.

Run the script:
    $ python cli_vae_demo.py --model_path THUDM/CogVideoX-2b --video_path path/to/video.mp4 --output_path path/to/output
"""
import argparse
import os

import imageio
import numpy as np
import torch
from diffusers import AutoencoderKLCogVideoX
from torchvision import transforms


def vae_demo(model_path, video_path, dtype, device):
    """
    Loads a pre-trained AutoencoderKLCogVideoX model and reconstructs the video frames (encode followed by decode).

    Parameters:
    - model_path (str): The path to the pre-trained model.
    - video_path (str): The path to the video file.
    - dtype (torch.dtype): The data type for computation.
    - device (str): The device to use for computation (e.g., "cuda" or "cpu").

    Returns:
    - DecoderOutput: The model output; its ``.sample`` tensor holds the reconstructed frames.
    """
    # Load the pre-trained VAE
    model = AutoencoderKLCogVideoX.from_pretrained(model_path, torch_dtype=dtype).to(device)
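
    # Optional memory savers (an assumption: CogVideoX's VAE exposes the usual
    # AutoencoderKL tiling/slicing helpers in diffusers>=0.30; uncomment to
    # trade speed for a far lower peak memory than the ~71GB noted above):
    # model.enable_slicing()
    # model.enable_tiling()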
    # Load video frames
    video_reader = imageio.get_reader(video_path, "ffmpeg")
    frames = [frame for frame in video_reader]
    video_reader.close()

    # Transform frames to tensors in [0, 1], stacked to [T, C, H, W]
    transform = transforms.Compose(
        [
            transforms.ToTensor(),
        ]
    )
    frames_tensor = torch.stack([transform(frame) for frame in frames])

    # Reorder to [C, T, H, W] and add a batch dimension -> [1, C, T, H, W]
    # (e.g., [1, 3, 49, 480, 720] for a 49-frame 480x720 clip), then map
    # [0, 1] to [-1, 1], the usual diffusers VAE input range
    frames_tensor = frames_tensor.permute(1, 0, 2, 3).unsqueeze(0)
    frames_tensor = (frames_tensor * 2.0 - 1.0).to(dtype).to(device)

    # Run the model end to end: encode to latents, then decode back to frames
    with torch.no_grad():
        output = model(frames_tensor)
    return output
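
# A minimal encode-only sketch, assuming the diffusers AutoencoderKL convention:
# `model.encode(...)` returns an output with a `latent_dist` posterior and the
# model config exposes a `scaling_factor` for the latents. Useful when only the
# latents are needed (e.g., as diffusion model inputs) rather than a full
# reconstruction; `frames_tensor` is the same [1, C, T, H, W] input as above.
def encode_video(model, frames_tensor):
    with torch.no_grad():
        # Sample from the posterior produced by the encoder and scale
        latent_dist = model.encode(frames_tensor).latent_dist
        latents = latent_dist.sample() * model.config.scaling_factor
    return latents
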
def save_video(tensor, output_path):
    """
    Saves the reconstructed video frames to a video file.

    Parameters:
    - tensor (DecoderOutput): The model output; ``.sample`` holds the frames.
    - output_path (str): The directory in which to save the output video.
    """
    # Drop the batch dimension and permute [C, T, H, W] -> [T, H, W, C]
    frames = tensor.sample[0].permute(1, 2, 3, 0).float().cpu().numpy()

    # Map [-1, 1] back to [0, 1], clip, and convert to uint8
    frames = np.clip(frames / 2 + 0.5, 0, 1)
    frames = (frames * 255).astype(np.uint8)

    # Save frames to video (fps is fixed at 30 here)
    writer = imageio.get_writer(os.path.join(output_path, "output.mp4"), fps=30)
    for frame in frames:
        writer.append_data(frame)
    writer.close()
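
# A small helper sketch for preserving the source frame rate instead of the
# fixed fps=30 above (an assumption: imageio's ffmpeg reader reports "fps" in
# its metadata dictionary, which holds for typical mp4 inputs); pass its result
# as the `fps` argument when creating the writer:
def source_fps(video_path, default=30):
    reader = imageio.get_reader(video_path, "ffmpeg")
    fps = reader.get_meta_data().get("fps", default)
    reader.close()
    return fps
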
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Encode and decode a video with the CogVideoX VAE")
    parser.add_argument("--model_path", type=str, required=True, help="The path to the CogVideoX model")
    parser.add_argument("--video_path", type=str, required=True, help="The path to the video file")
    parser.add_argument("--output_path", type=str, default="./", help="The directory in which to save the output video")
    parser.add_argument(
        "--dtype", type=str, default="float16", help="The data type for computation (e.g., 'float16' or 'float32')"
    )
    parser.add_argument(
        "--device", type=str, default="cuda", help="The device to use for computation (e.g., 'cuda' or 'cpu')"
    )
    args = parser.parse_args()

    # Set device and dtype
    device = torch.device(args.device)
    dtype = torch.float16 if args.dtype == "float16" else torch.float32

    output = vae_demo(args.model_path, args.video_path, dtype, device)
    save_video(output, args.output_path)