""" | |
This script demonstrates how to encode video frames using a pre-trained CogVideoX model with 🤗 Huggingface Diffusers. | |
Note: | |
This script requires the `diffusers>=0.30.0` library to be installed. | |
If the video appears “completely green” and cannot be viewed, please switch to a different player to watch it. This is a normal phenomenon. | |
Cost 71GB of GPU memory for encoding a 6s video at 720p resolution. | |
Run the script: | |
$ python cli_demo.py --model_path THUDM/CogVideoX-2b --video_path path/to/video.mp4 --output_path path/to/output | |
""" | |
import argparse
import os

import imageio
import numpy as np
import torch
from diffusers import AutoencoderKLCogVideoX
from torchvision import transforms


def vae_demo(model_path, video_path, dtype, device):
    """
    Loads the VAE from a pre-trained CogVideoX model and runs the video frames through its encoder and decoder.

    Parameters:
    - model_path (str): The path to the pre-trained model.
    - video_path (str): The path to the video file.
    - dtype (torch.dtype): The data type for computation.
    - device (str): The device to use for computation (e.g., "cuda" or "cpu").

    Returns:
    - DecoderOutput: The reconstructed video frames; the tensor is in its `.sample` attribute.
    """
    # Load the pre-trained model
    model = AutoencoderKLCogVideoX.from_pretrained(model_path, torch_dtype=dtype).to(device)
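
    # Optional, not exercised in this demo: AutoencoderKLCogVideoX exposes slicing
    # and tiling to lower peak memory at some cost in speed; uncomment these if the
    # ~71GB footprint noted above is too large for your GPU.
    # model.enable_slicing()
    # model.enable_tiling()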
    # Load video frames
    video_reader = imageio.get_reader(video_path, "ffmpeg")
    frames = [frame for frame in video_reader]
    video_reader.close()
    # Convert frames to tensors in [-1, 1], the input range diffusers VAEs expect
    transform = transforms.Compose(
        [
            transforms.ToTensor(),  # HWC uint8 -> CHW float in [0, 1]
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),  # [0, 1] -> [-1, 1]
        ]
    )
    frames_tensor = torch.stack([transform(frame) for frame in frames])

    # Reorder to [C, T, H, W] and add a batch dimension: [1, C, T, H, W]
    frames_tensor = frames_tensor.permute(1, 0, 2, 3).unsqueeze(0).to(device, dtype)
    # Run the frames through the encoder and decoder; the forward pass returns
    # a DecoderOutput whose `.sample` holds the reconstructed frames
    with torch.no_grad():
        output = model(frames_tensor)
    return output
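
# If only the latents are needed (no reconstruction), a minimal sketch using the
# standard diffusers VAE API, where encode() returns an object carrying a
# `latent_dist` distribution:
#
#     with torch.no_grad():
#         latent_dist = model.encode(frames_tensor).latent_dist
#         latents = latent_dist.sample()  # or latent_dist.mode() for a deterministic encoding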


def save_video(tensor, output_path):
    """
    Saves the reconstructed video frames to a video file.

    Parameters:
    - tensor (DecoderOutput): The reconstructed video frames.
    - output_path (str): The directory in which to save the output video.
    """
    # Take the sample tensor from the DecoderOutput, drop the batch dimension,
    # and permute to [T, H, W, C]
    frames = tensor[0].squeeze(0).permute(1, 2, 3, 0).cpu().numpy()

    # Map from [-1, 1] back to [0, 1], clip, and convert to uint8
    frames = np.clip((frames + 1.0) / 2.0, 0, 1)
    frames = (frames * 255).astype(np.uint8)
    # Save frames to video
    writer = imageio.get_writer(os.path.join(output_path, "output.mp4"), fps=30)
    for frame in frames:
        writer.append_data(frame)
    writer.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Encode and reconstruct a video with the CogVideoX VAE")
    parser.add_argument("--model_path", type=str, required=True, help="The path to the CogVideoX model")
    parser.add_argument("--video_path", type=str, required=True, help="The path to the video file")
    parser.add_argument("--output_path", type=str, default="./", help="The directory in which to save the output video")
    parser.add_argument(
        "--dtype", type=str, default="float16", help="The data type for computation (e.g., 'float16' or 'float32')"
    )
    parser.add_argument(
        "--device", type=str, default="cuda", help="The device to use for computation (e.g., 'cuda' or 'cpu')"
    )
    args = parser.parse_args()
    # Set device and dtype
    device = torch.device(args.device)
    dtype = torch.float16 if args.dtype == "float16" else torch.float32
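    # A possible extension (not wired into --dtype above): CogVideoX-5b is commonly
    # run in bfloat16, which this float16/float32 mapping does not cover. A sketch,
    # assuming the target device supports bf16:
    # if args.dtype == "bfloat16":
    #     dtype = torch.bfloat16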
    output = vae_demo(args.model_path, args.video_path, dtype, device)
    save_video(output, args.output_path)