# General
import os
from os.path import join as opj
import argparse
import datetime
from pathlib import Path
import torch
import spaces
import gradio as gr
import tempfile
import yaml

from t2v_enhanced.model.video_ldm import VideoLDM

# Utilities
from t2v_enhanced.inference_utils import *
from t2v_enhanced.model_init import *
from t2v_enhanced.model_func import *


on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"

parser = argparse.ArgumentParser()
parser.add_argument('--public_access', action='store_true', default=True)
parser.add_argument('--where_to_log', type=str, default="gradio_output")
parser.add_argument('--device', type=str, default="cuda")
args = parser.parse_args()

Path(args.where_to_log).mkdir(parents=True, exist_ok=True)
result_fol = Path(args.where_to_log).absolute()
device = args.device


# --------------------------
# ----- Configurations -----
# --------------------------
ckpt_file_streaming_t2v = Path("t2v_enhanced/checkpoints/streaming_t2v.ckpt").absolute()
cfg_v2v = {'downscale': 1, 'upscale_size': (1280, 720), 'model_id': 'damo/Video-to-Video', 'pad': True}


# --------------------------
# ----- Initialization -----
# --------------------------
ms_model = init_modelscope(device)
# zs_model = init_zeroscope(device)
ad_model = init_animatediff(device)
svd_model = init_svd(device)
sdxl_model = init_sdxl(device)
stream_cli, stream_model = init_streamingt2v_model(ckpt_file_streaming_t2v, result_fol)
msxl_model = init_v2v_model(cfg_v2v)

inference_generator = torch.Generator(device="cuda")


# -------------------------
# ----- Functionality -----
# -------------------------
@spaces.GPU
def generate(prompt, num_frames, image, model_name_stage1, model_name_stage2,
             seed, t, image_guidance, where_to_log=result_fol):
    # Unique output name derived from the prompt and the current time.
    now = datetime.datetime.now()
    name = prompt[:100].replace(" ", "_") + "_" + str(now.time()).replace(":", "_").replace(".", "_")

    # The UI passes frame counts as strings such as "24 - frames".
    if num_frames == [] or num_frames is None:
        num_frames = 56
    else:
        num_frames = int(num_frames.split(" ")[0])
    # Number of autoregressive extension steps derived from the requested length.
    n_autoreg_gen = num_frames // 8 - 8

    inference_generator.manual_seed(seed)

    # Stage 1: generate a short preview clip with the selected base model.
    if model_name_stage1 == "ModelScopeT2V (text to video)":
        short_video = ms_short_gen(prompt, ms_model, inference_generator, t, device)
    elif model_name_stage1 == "AnimateDiff (text to video)":
        short_video = ad_short_gen(prompt, ad_model, inference_generator, t, device)
    elif model_name_stage1 == "SVD (image to video)":
        short_video = svd_short_gen(image, prompt, svd_model, sdxl_model, inference_generator, t, device)

    # Stage 2: autoregressively extend the preview to the full length with StreamingT2V.
    stream_long_gen(prompt, short_video, n_autoreg_gen, seed, t, image_guidance, name, stream_cli, stream_model)
    video_path = opj(where_to_log, name + ".mp4")
    return video_path


def enhance(prompt, input_to_enhance, num_frames=None, image=None, model_name_stage1=None,
            model_name_stage2=None, seed=33, t=50, image_guidance=9.5, result_fol=result_fol):
    # If no video was supplied, run the full generation pipeline first.
    if input_to_enhance is None:
        input_to_enhance = generate(prompt, num_frames, image, model_name_stage1, model_name_stage2,
                                    seed, t, image_guidance)
    # Final stage: upscale/enhance the video with MS-Vid2Vid-XL.
    encoded_video = video2video(prompt, input_to_enhance, result_fol, cfg_v2v, msxl_model)
    return encoded_video


def change_visibility(value):
    # Enable the image-prompt upload only when the image-to-video model is selected.
    if value == "SVD (image to video)":
        return gr.Image(label='Image Prompt (if not attached then SDXL will be used to generate the starting image)',
                        show_label=True, scale=1, show_download_button=False, interactive=True, type='pil')
    else:
        return gr.Image(label='Image Prompt (first select Image-to-Video model from advanced options to enable image upload)',
                        show_label=True, scale=1, show_download_button=False, interactive=False, type='pil')
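
# Illustrative (hypothetical) direct use of the pipeline, bypassing the UI below.
# The prompt is a placeholder and the "24 - frames" string mirrors the dropdown
# format used by the demo; none of these values are defaults from the original.
#
#   low_res_path = generate("A cat surfing a wave.", "24 - frames", None,
#                           "ModelScopeT2V (text to video)", "MS-Vid2Vid-XL",
#                           seed=33, t=50, image_guidance=9.0)
#   final_path = enhance("A cat surfing a wave.", low_res_path)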
# Each example row matches the inputs of `enhance`:
# [prompt, input_to_enhance, num_frames, image, model_name_stage1, model_name_stage2, seed, t, image_guidance]
examples = [
    ["Camera moving in a wide bright ice cave.",
     None, "24 - frames", None, "ModelScopeT2V (text to video)", "MS-Vid2Vid-XL", 33, 50, 9.0],
    ["Explore the coral gardens of the sea: witness the kaleidoscope of colors and shapes as coral reefs provide shelter for a myriad of marine life.",
     None, "24 - frames", None, "ModelScopeT2V (text to video)", "MS-Vid2Vid-XL", 33, 50, 9.0],
    ["Experience the dance of jellyfish: float through mesmerizing swarms of jellyfish, pulsating with otherworldly grace and beauty.",
     None, "24 - frames", None, "ModelScopeT2V (text to video)", "MS-Vid2Vid-XL", 33, 50, 9.0],
    ["Discover the secret language of bees: delve into the complex communication system that allows bees to coordinate their actions and navigate the world.",
     None, "24 - frames", None, "AnimateDiff (text to video)", "MS-Vid2Vid-XL", 33, 50, 9.0],
    ["A beagle reading a paper.",
     None, "24 - frames", None, "AnimateDiff (text to video)", "MS-Vid2Vid-XL", 33, 50, 9.0],
    ["Beautiful Paris Day and Night Hyperlapse.",
     None, "24 - frames", None, "AnimateDiff (text to video)", "MS-Vid2Vid-XL", 33, 50, 9.0],
    ["Fishes swimming in ocean camera moving, cinematic.",
     None, "24 - frames", "__assets__/fish.jpg", "SVD (image to video)", "MS-Vid2Vid-XL", 33, 50, 9.0],
    ["A squirrel on a table full of big nuts.",
     None, "24 - frames", "__assets__/squirrel.jpg", "SVD (image to video)", "MS-Vid2Vid-XL", 33, 50, 9.0],
    ["Ants, beetles and centipede nest.",
     None, "24 - frames", None, "SVD (image to video)", "MS-Vid2Vid-XL", 33, 50, 9.0],
]


# --------------------------
# ----- Gradio-Demo UI -----
# --------------------------
with gr.Blocks() as demo:
    gr.HTML(
        """
        For faster inference without waiting in the queue, you may duplicate this Space and upgrade to a GPU in the settings.
        """
    )
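
# A minimal sketch of a launch block so the script runs end to end. The full
# demo layout continues beyond this excerpt, and the queue settings and flags
# below are assumptions for illustration, not the original values; queue() and
# launch() themselves are standard Gradio calls.
if __name__ == "__main__":
    if on_huggingspace:
        demo.queue(max_size=20)  # assumed queue size
        demo.launch(debug=True)
    else:
        demo.queue(api_open=False).launch(share=args.public_access)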