diff --git a/README.md b/README.md
index a7e830b11b8f67b7866abc46777b5c588245239d..7babf2c3c1236c32dbe87437adab5acb1dfb7661 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,14 @@
 ---
-title: FlipSketch
-emoji: 🚀
-colorFrom: purple
-colorTo: green
-sdk: docker
-pinned: false
-license: mit
-short_description: Sketch Animations
+title: FlipSketch
+emoji: 🚀
+colorFrom: blue
+colorTo: green
+sdk: gradio
+app_file: app.py
+pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+# FlipSketch
+
+FlipSketch: Flipping Static Drawings to Text-Guided Sketch Animations
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..744792e7d503d169732b02540bc7ac677909a4fa
--- /dev/null
+++ b/app.py
@@ -0,0 +1,271 @@
+from flask import Flask, render_template, request, jsonify
+import os
+import cv2
+import subprocess
+import torch
+import torchvision
+import warnings
+import numpy as np
+from PIL import Image, ImageSequence
+from moviepy.editor import VideoFileClip
+import imageio
+import uuid
+
+from diffusers import (
+    TextToVideoSDPipeline,
+    AutoencoderKL,
+    DDPMScheduler,
+    DDIMScheduler,
+    UNet3DConditionModel,
+)
+import time
+from transformers import CLIPTokenizer, CLIPTextModel
+
+from diffusers.utils import export_to_video
+from gifs_filter import filter
+from invert_utils import ddim_inversion as dd_inversion
+from text2vid_modded import TextToVideoSDPipelineModded
+
+
+def run_setup():
+    try:
+        # Step 1: Install Git LFS
+        subprocess.run(["git", "lfs", "install"], check=True)
+
+        # Step 2: Clone the repository
+        repo_url = "https://huggingface.co/Hmrishav/t2v_sketch-lora"
+        subprocess.run(["git", "clone", repo_url], check=True)
+
+        # Step 3: Move the checkpoint file
+        source = "t2v_sketch-lora/checkpoint-2500"
+        destination = "./checkpoint-2500/"
+        os.rename(source, destination)
+
+        print("Setup completed successfully!")
+    except subprocess.CalledProcessError as e:
+        print(f"Error during setup: {e}")
+    except FileNotFoundError as e:
+        print(f"File operation error: {e}")
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+
+# Automatically run setup during app initialization
+run_setup()
+
+
+# Flask app setup
+app = Flask(__name__)
+app.config['UPLOAD_FOLDER'] = 'static/uploads'
+app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size
+os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+
+# Environment setup
+os.environ["TORCH_CUDNN_V8_API_ENABLED"] = "1"
+LORA_CHECKPOINT = "checkpoint-2500"
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+dtype = torch.bfloat16
+
+# Helper functions
+
+def cleanup_old_files(directory, age_in_seconds = 600):
+    """
+    Deletes files older than a certain age in the specified directory.
+
+    Args:
+        directory (str): The directory to clean up.
+        age_in_seconds (int): The age in seconds; files older than this will be deleted.
+ """ + now = time.time() + for filename in os.listdir(directory): + file_path = os.path.join(directory, filename) + # Only delete files (not directories) + if os.path.isfile(file_path): + file_age = now - os.path.getmtime(file_path) + if file_age > age_in_seconds: + try: + os.remove(file_path) + print(f"Deleted old file: {file_path}") + except Exception as e: + print(f"Error deleting file {file_path}: {e}") + +def load_frames(image: Image, mode='RGBA'): + return np.array([np.array(frame.convert(mode)) for frame in ImageSequence.Iterator(image)]) + +def save_gif(frames, path): + imageio.mimsave(path, [frame.astype(np.uint8) for frame in frames], format='GIF', duration=1/10) + +def load_image(imgname, target_size=None): + pil_img = Image.open(imgname).convert('RGB') + if target_size: + if isinstance(target_size, int): + target_size = (target_size, target_size) + pil_img = pil_img.resize(target_size, Image.Resampling.LANCZOS) + return torchvision.transforms.ToTensor()(pil_img).unsqueeze(0) # Add batch dimension + +def prepare_latents(pipe, x_aug): + with torch.cuda.amp.autocast(): + batch_size, num_frames, channels, height, width = x_aug.shape + x_aug = x_aug.reshape(batch_size * num_frames, channels, height, width) + latents = pipe.vae.encode(x_aug).latent_dist.sample() + latents = latents.view(batch_size, num_frames, -1, latents.shape[2], latents.shape[3]) + latents = latents.permute(0, 2, 1, 3, 4) + return pipe.vae.config.scaling_factor * latents + +@torch.no_grad() +def invert(pipe, inv, load_name, device="cuda", dtype=torch.bfloat16): + input_img = [load_image(load_name, 256).to(device, dtype=dtype).unsqueeze(1)] * 5 + input_img = torch.cat(input_img, dim=1) + latents = prepare_latents(pipe, input_img).to(torch.bfloat16) + inv.set_timesteps(25) + id_latents = dd_inversion(pipe, inv, video_latent=latents, num_inv_steps=25, prompt="")[-1].to(dtype) + return torch.mean(id_latents, dim=2, keepdim=True) + +def load_primary_models(pretrained_model_path): + return ( + DDPMScheduler.from_config(pretrained_model_path, subfolder="scheduler"), + CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer"), + CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder"), + AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae"), + UNet3DConditionModel.from_pretrained(pretrained_model_path, subfolder="unet"), + ) + + +def initialize_pipeline(model: str, device: str = "cuda"): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + scheduler, tokenizer, text_encoder, vae, unet = load_primary_models(model) + pipe = TextToVideoSDPipeline.from_pretrained( + pretrained_model_name_or_path="damo-vilab/text-to-video-ms-1.7b", + scheduler=scheduler, + tokenizer=tokenizer, + text_encoder=text_encoder.to(device=device, dtype=torch.bfloat16), + vae=vae.to(device=device, dtype=torch.bfloat16), + unet=unet.to(device=device, dtype=torch.bfloat16), + ) + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + return pipe, pipe.scheduler + +pipe_inversion, inv = initialize_pipeline(LORA_CHECKPOINT, device) +pipe = TextToVideoSDPipelineModded.from_pretrained( + pretrained_model_name_or_path="damo-vilab/text-to-video-ms-1.7b", + scheduler=pipe_inversion.scheduler, + tokenizer=pipe_inversion.tokenizer, + text_encoder=pipe_inversion.text_encoder, + vae=pipe_inversion.vae, + unet=pipe_inversion.unet, +).to(device) + +@torch.no_grad() +def process(num_frames, num_seeds, generator, exp_dir, load_name, caption, lambda_): + pipe_inversion.to(device) + 
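The `invert()` call that follows boils down to a few tensor manipulations: the input sketch is replicated into five identical frames, folded into the batch so the image VAE can encode it, DDIM-inverted for 25 steps, and finally averaged over the frame axis. A shape-level sketch of that flow, with a dummy tensor standing in for the VAE output (the 4 latent channels and 8x downsampling are assumptions typical of SD-family VAEs, not something this diff pins down):

```python
# Shape-level sketch of prepare_latents() / invert() above.
import torch

B, F, C, H, W = 1, 5, 3, 256, 256            # one sketch replicated into 5 identical frames
frames = torch.rand(B, F, C, H, W)

flat = frames.reshape(B * F, C, H, W)         # fold frames into the batch: (5, 3, 256, 256)
# stand-in for pipe.vae.encode(flat).latent_dist.sample() * scaling_factor (assumed geometry)
latents = torch.rand(B * F, 4, H // 8, W // 8)  # (5, 4, 32, 32)

# unfold and move channels ahead of frames, as the 3D UNet expects
latents = latents.view(B, F, -1, latents.shape[2], latents.shape[3]).permute(0, 2, 1, 3, 4)
print(latents.shape)                          # (1, 4, 5, 32, 32)

# invert() then runs 25 DDIM inversion steps and averages over the frame axis,
# leaving one static latent that process() repeats once per seed
id_latents = latents.mean(dim=2, keepdim=True)
print(id_latents.shape)                       # (1, 4, 1, 32, 32)
```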
id_latents = invert(pipe_inversion, inv, load_name).to(device, dtype=dtype) + latents = id_latents.repeat(num_seeds, 1, 1, 1, 1) + generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(num_seeds)] + video_frames = pipe( + prompt=caption, + negative_prompt="", + num_frames=num_frames, + num_inference_steps=25, + inv_latents=latents, + guidance_scale=9, + generator=generator, + lambda_=lambda_, + ).frames + try: + load_name = load_name.split("/")[-1] + except: + pass + gifs = [] + for seed in range(num_seeds): + vid_name = f"{exp_dir}/mp4_logs/vid_{load_name[:-4]}-rand{seed}.mp4" + gif_name = f"{exp_dir}/gif_logs/vid_{load_name[:-4]}-rand{seed}.gif" + video_path = export_to_video(video_frames[seed], output_video_path=vid_name) + VideoFileClip(vid_name).write_gif(gif_name) + with Image.open(gif_name) as im: + frames = load_frames(im) + + frames_collect = np.empty((0, 1024, 1024), int) + for frame in frames: + frame = cv2.resize(frame, (1024, 1024))[:, :, :3] + frame = cv2.cvtColor(255 - frame, cv2.COLOR_RGB2GRAY) + + _, frame = cv2.threshold(255 - frame, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + + frames_collect = np.append(frames_collect, [frame], axis=0) + + save_gif(frames_collect, gif_name) + gifs.append(gif_name) + + return gifs + + +def generate_gifs(filepath, prompt, num_seeds=5, lambda_=0): + exp_dir = "static/app_tmp" + os.makedirs(exp_dir, exist_ok=True) + gifs = process( + num_frames=10, + num_seeds=num_seeds, + generator=None, + exp_dir=exp_dir, + load_name=filepath, + caption=prompt, + lambda_=lambda_ + ) + return gifs + +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('/generate', methods=['POST']) +def generate(): + + directories_to_clean = [ + app.config['UPLOAD_FOLDER'], + 'static/app_tmp/mp4_logs', + 'static/app_tmp/gif_logs', + 'static/app_tmp/png_logs' + ] + + # Perform cleanup + os.makedirs('static/app_tmp', exist_ok=True) + for directory in directories_to_clean: + os.makedirs(directory, exist_ok=True) # Ensure the directory exists + cleanup_old_files(directory) + + prompt = request.form.get('prompt', '') + num_gifs = int(request.form.get('seeds', 3)) + lambda_value = 1 - float(request.form.get('lambda', 0.5)) + selected_example = request.form.get('selected_example', None) + file = request.files.get('image') + + if not file and not selected_example: + return jsonify({'error': 'No image file provided or example selected'}), 400 + + if selected_example: + # Use the selected example image + filepath = os.path.join('static', 'examples', selected_example) + unique_id = None # No need for unique ID + else: + # Save the uploaded image + unique_id = str(uuid.uuid4()) + filepath = os.path.join(app.config['UPLOAD_FOLDER'], f"{unique_id}_uploaded_image.png") + file.save(filepath) + + generated_gifs = generate_gifs(filepath, prompt, num_seeds=num_gifs, lambda_=lambda_value) + + unique_id = str(uuid.uuid4()) + # Append unique id to each gif path + for i in range(len(generated_gifs)): + os.rename(generated_gifs[i], f"{generated_gifs[i].split('.')[0]}_{unique_id}.gif") + generated_gifs[i] = f"{generated_gifs[i].split('.')[0]}_{unique_id}.gif" + # Move the generated gifs to the static folder + + + filtered_gifs = filter(generated_gifs, filepath) + return jsonify({'gifs': filtered_gifs, 'prompt': prompt}) + +if __name__ == '__main__': + + + app.run(debug=True) \ No newline at end of file diff --git a/app_full.py b/app_full.py new file mode 100644 index 
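With the routes above in place, `/generate` accepts multipart form data: a `prompt`, a `seeds` count, a `lambda` value in [0, 1] (the server uses `1 - lambda` internally), plus either an uploaded `image` or a `selected_example` filename, and it responds with JSON listing the filtered GIF paths. A minimal client sketch, assuming the Flask dev server is running on its default local port:

```python
# Hypothetical client for the POST /generate route defined above.
import requests

with open("my_sketch.png", "rb") as f:                       # any local sketch image
    resp = requests.post(
        "http://127.0.0.1:5000/generate",                    # Flask dev-server default (assumption)
        data={"prompt": "a cat stretching", "seeds": 3, "lambda": 0.5},
        files={"image": f},
    )
resp.raise_for_status()
result = resp.json()
print(result["prompt"])
print(result["gifs"])                                         # GIFs that passed the CLIP filter
```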
0000000000000000000000000000000000000000..d4226687715d97360f4d693f37f96f077557e792 --- /dev/null +++ b/app_full.py @@ -0,0 +1,243 @@ +from flask import Flask, render_template, request, jsonify +import os +import cv2 +import torch +import torchvision +import warnings +import numpy as np +from PIL import Image, ImageSequence +from moviepy.editor import VideoFileClip +import imageio +import uuid + +from diffusers import ( + TextToVideoSDPipeline, + AutoencoderKL, + DDPMScheduler, + DDIMScheduler, + UNet3DConditionModel, +) +import time +from transformers import CLIPTokenizer, CLIPTextModel + +from diffusers.utils import export_to_video +from gifs_filter import filter +from invert_utils import ddim_inversion as dd_inversion +from text2vid_modded_full import TextToVideoSDPipelineModded + +# Flask app setup +app = Flask(__name__) +app.config['UPLOAD_FOLDER'] = 'static/uploads' +app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max file size +os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True) + +# Environment setup +os.environ["TORCH_CUDNN_V8_API_ENABLED"] = "1" +LORA_CHECKPOINT = "checkpoint-2500" +device = 'cuda' if torch.cuda.is_available() else 'cpu' +dtype = torch.bfloat16 + +# Helper functions + +def cleanup_old_files(directory, age_in_seconds = 600): + """ + Deletes files older than a certain age in the specified directory. + + Args: + directory (str): The directory to clean up. + age_in_seconds (int): The age in seconds; files older than this will be deleted. + """ + now = time.time() + for filename in os.listdir(directory): + file_path = os.path.join(directory, filename) + # Only delete files (not directories) + if os.path.isfile(file_path): + file_age = now - os.path.getmtime(file_path) + if file_age > age_in_seconds: + try: + os.remove(file_path) + print(f"Deleted old file: {file_path}") + except Exception as e: + print(f"Error deleting file {file_path}: {e}") + +def load_frames(image: Image, mode='RGBA'): + return np.array([np.array(frame.convert(mode)) for frame in ImageSequence.Iterator(image)]) + +def save_gif(frames, path): + imageio.mimsave(path, [frame.astype(np.uint8) for frame in frames], format='GIF', duration=1/10) + +def load_image(imgname, target_size=None): + pil_img = Image.open(imgname).convert('RGB') + if target_size: + if isinstance(target_size, int): + target_size = (target_size, target_size) + pil_img = pil_img.resize(target_size, Image.Resampling.LANCZOS) + return torchvision.transforms.ToTensor()(pil_img).unsqueeze(0) # Add batch dimension + +def prepare_latents(pipe, x_aug): + with torch.cuda.amp.autocast(): + batch_size, num_frames, channels, height, width = x_aug.shape + x_aug = x_aug.reshape(batch_size * num_frames, channels, height, width) + latents = pipe.vae.encode(x_aug).latent_dist.sample() + latents = latents.view(batch_size, num_frames, -1, latents.shape[2], latents.shape[3]) + latents = latents.permute(0, 2, 1, 3, 4) + return pipe.vae.config.scaling_factor * latents + +@torch.no_grad() +def invert(pipe, inv, load_name, device="cuda", dtype=torch.bfloat16): + input_img = [load_image(load_name, 256).to(device, dtype=dtype).unsqueeze(1)] * 5 + input_img = torch.cat(input_img, dim=1) + latents = prepare_latents(pipe, input_img).to(torch.bfloat16) + inv.set_timesteps(25) + id_latents = dd_inversion(pipe, inv, video_latent=latents, num_inv_steps=25, prompt="")[-1].to(dtype) + return torch.mean(id_latents, dim=2, keepdim=True) + +def load_primary_models(pretrained_model_path): + return ( + DDPMScheduler.from_config(pretrained_model_path, 
subfolder="scheduler"), + CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer"), + CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder"), + AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae"), + UNet3DConditionModel.from_pretrained(pretrained_model_path, subfolder="unet"), + ) + + +def initialize_pipeline(model: str, device: str = "cuda"): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + scheduler, tokenizer, text_encoder, vae, unet = load_primary_models(model) + pipe = TextToVideoSDPipeline.from_pretrained( + pretrained_model_name_or_path="damo-vilab/text-to-video-ms-1.7b", + scheduler=scheduler, + tokenizer=tokenizer, + text_encoder=text_encoder.to(device=device, dtype=torch.bfloat16), + vae=vae.to(device=device, dtype=torch.bfloat16), + unet=unet.to(device=device, dtype=torch.bfloat16), + ) + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + return pipe, pipe.scheduler + +pipe_inversion, inv = initialize_pipeline(LORA_CHECKPOINT, device) +pipe = TextToVideoSDPipelineModded.from_pretrained( + pretrained_model_name_or_path="damo-vilab/text-to-video-ms-1.7b", + scheduler=pipe_inversion.scheduler, + tokenizer=pipe_inversion.tokenizer, + text_encoder=pipe_inversion.text_encoder, + vae=pipe_inversion.vae, + unet=pipe_inversion.unet, +).to(device) + +@torch.no_grad() +def process(num_frames, num_seeds, generator, exp_dir, load_name, caption, lambda_): + pipe_inversion.to(device) + id_latents = invert(pipe_inversion, inv, load_name).to(device, dtype=dtype) + latents = id_latents.repeat(num_seeds, 1, 1, 1, 1) + generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(num_seeds)] + video_frames = pipe( + prompt=caption, + negative_prompt="", + num_frames=num_frames, + num_inference_steps=25, + inv_latents=latents, + guidance_scale=9, + generator=generator, + lambda_=lambda_, + ).frames + try: + load_name = load_name.split("/")[-1] + except: + pass + gifs = [] + for seed in range(num_seeds): + vid_name = f"{exp_dir}/mp4_logs/vid_{load_name[:-4]}-rand{seed}.mp4" + gif_name = f"{exp_dir}/gif_logs/vid_{load_name[:-4]}-rand{seed}.gif" + video_path = export_to_video(video_frames[seed], output_video_path=vid_name) + VideoFileClip(vid_name).write_gif(gif_name) + with Image.open(gif_name) as im: + frames = load_frames(im) + + frames_collect = np.empty((0, 1024, 1024), int) + for frame in frames: + frame = cv2.resize(frame, (1024, 1024))[:, :, :3] + frame = cv2.cvtColor(255 - frame, cv2.COLOR_RGB2GRAY) + + _, frame = cv2.threshold(255 - frame, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + + frames_collect = np.append(frames_collect, [frame], axis=0) + + save_gif(frames_collect, gif_name) + gifs.append(gif_name) + + return gifs + + +def generate_gifs(filepath, prompt, num_seeds=5, lambda_=0): + exp_dir = "static/app_tmp" + os.makedirs(exp_dir, exist_ok=True) + gifs = process( + num_frames=10, + num_seeds=num_seeds, + generator=None, + exp_dir=exp_dir, + load_name=filepath, + caption=prompt, + lambda_=lambda_ + ) + return gifs + +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('/generate', methods=['POST']) +def generate(): + + directories_to_clean = [ + app.config['UPLOAD_FOLDER'], + 'static/app_tmp/mp4_logs', + 'static/app_tmp/gif_logs', + 'static/app_tmp/png_logs' + ] + + # Perform cleanup + os.makedirs('static/app_tmp', exist_ok=True) + for directory in directories_to_clean: + os.makedirs(directory, exist_ok=True) # Ensure the directory exists + 
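The post-processing loop inside `process()` above turns every generated frame back into a clean black-on-white sketch: resize to 1024x1024, drop the alpha channel, invert, convert to grayscale, and binarize with Otsu's threshold. The same step on a single synthetic frame, for reference (the random array is just a placeholder for a decoded GIF frame):

```python
# Standalone sketch of the per-frame binarization used in process() above.
import cv2
import numpy as np

frame = np.random.randint(0, 256, (320, 576, 4), dtype=np.uint8)  # placeholder RGBA frame

frame = cv2.resize(frame, (1024, 1024))[:, :, :3]                 # upscale, keep RGB only
gray = cv2.cvtColor(255 - frame, cv2.COLOR_RGB2GRAY)              # invert, then grayscale
# Otsu picks the threshold automatically; strokes end up black (0) on white (255)
_, binary = cv2.threshold(255 - gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
print(binary.shape, np.unique(binary))
```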
cleanup_old_files(directory) + + prompt = request.form.get('prompt', '') + num_gifs = int(request.form.get('seeds', 3)) + lambda_value = 1 - float(request.form.get('lambda', 0.5)) + selected_example = request.form.get('selected_example', None) + file = request.files.get('image') + + if not file and not selected_example: + return jsonify({'error': 'No image file provided or example selected'}), 400 + + if selected_example: + # Use the selected example image + filepath = os.path.join('static', 'examples', selected_example) + unique_id = None # No need for unique ID + else: + # Save the uploaded image + unique_id = str(uuid.uuid4()) + filepath = os.path.join(app.config['UPLOAD_FOLDER'], f"{unique_id}_uploaded_image.png") + file.save(filepath) + + generated_gifs = generate_gifs(filepath, prompt, num_seeds=num_gifs, lambda_=lambda_value) + + unique_id = str(uuid.uuid4()) + # Append unique id to each gif path + for i in range(len(generated_gifs)): + os.rename(generated_gifs[i], f"{generated_gifs[i].split('.')[0]}_{unique_id}.gif") + generated_gifs[i] = f"{generated_gifs[i].split('.')[0]}_{unique_id}.gif" + # Move the generated gifs to the static folder + + + filtered_gifs = filter(generated_gifs, filepath) + return jsonify({'gifs': filtered_gifs, 'prompt': prompt}) + +if __name__ == '__main__': + + + app.run(debug=True) \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..1bc67341b2e9a0f485ccbb57b0932591267aa2c9 --- /dev/null +++ b/environment.yml @@ -0,0 +1,402 @@ +name: flipsketch +channels: + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - asttokens=2.4.1=pyhd8ed1ab_0 + - blas=1.0=mkl + - brotli-python=1.0.9=py310hd8f1fbe_7 + - bzip2=1.0.8=h7f98852_4 + - ca-certificates=2024.2.2=hbcca054_0 + - certifi=2024.2.2=pyhd8ed1ab_0 + - charset-normalizer=2.0.4=pyhd8ed1ab_0 + - comm=0.2.2=pyhd8ed1ab_0 + - cuda=11.6.1=0 + - cuda-cccl=11.6.55=hf6102b2_0 + - cuda-command-line-tools=11.6.2=0 + - cuda-compiler=11.6.2=0 + - cuda-cudart=11.6.55=he381448_0 + - cuda-cudart-dev=11.6.55=h42ad0f4_0 + - cuda-cuobjdump=11.6.124=h2eeebcb_0 + - cuda-cupti=11.6.124=h86345e5_0 + - cuda-cuxxfilt=11.6.124=hecbf4f6_0 + - cuda-driver-dev=11.6.55=0 + - cuda-gdb=12.4.127=0 + - cuda-libraries=11.6.1=0 + - cuda-libraries-dev=11.6.1=0 + - cuda-memcheck=11.8.86=0 + - cuda-nsight=12.4.127=0 + - cuda-nsight-compute=12.4.1=0 + - cuda-nvcc=11.6.124=hbba6d2d_0 + - cuda-nvdisasm=12.4.127=0 + - cuda-nvml-dev=11.6.55=haa9ef22_0 + - cuda-nvprof=12.4.127=0 + - cuda-nvprune=11.6.124=he22ec0a_0 + - cuda-nvrtc=11.6.124=h020bade_0 + - cuda-nvrtc-dev=11.6.124=h249d397_0 + - cuda-nvtx=11.6.124=h0630a44_0 + - cuda-nvvp=12.4.127=0 + - cuda-runtime=11.6.1=0 + - cuda-samples=11.6.101=h8efea70_0 + - cuda-sanitizer-api=12.4.127=0 + - cuda-toolkit=11.6.1=0 + - cuda-tools=11.6.1=0 + - cuda-visual-tools=11.6.1=0 + - debugpy=1.6.7=py310h6a678d5_0 + - entrypoints=0.4=pyhd8ed1ab_0 + - exceptiongroup=1.2.0=pyhd8ed1ab_2 + - executing=2.0.1=pyhd8ed1ab_0 + - ffmpeg=4.3=hf484d3e_0 + - freetype=2.12.1=h4a9f257_0 + - gds-tools=1.9.1.3=0 + - gmp=6.2.1=h58526e2_0 + - gnutls=3.6.15=he1e5248_0 + - idna=3.4=pyhd8ed1ab_0 + - intel-openmp=2023.1.0=hdb19cb5_46306 + - ipykernel=6.29.3=pyhd33586a_0 + - jedi=0.19.1=pyhd8ed1ab_0 + - jpeg=9e=h166bdaf_1 + - jupyter_client=7.3.4=pyhd8ed1ab_0 + - jupyter_core=5.7.2=pyh31011fe_1 + - lame=3.100=h7f98852_1001 + - lcms2=2.12=h3be6417_0 + - 
ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h9c3ff4c_0 + - libcublas=11.9.2.110=h5e84587_0 + - libcublas-dev=11.9.2.110=h5c901ab_0 + - libcufft=10.7.1.112=hf425ae0_0 + - libcufft-dev=10.7.1.112=ha5ce4c0_0 + - libcufile=1.9.1.3=0 + - libcufile-dev=1.9.1.3=0 + - libcurand=10.3.5.147=0 + - libcurand-dev=10.3.5.147=0 + - libcusolver=11.3.4.124=h33c3c4e_0 + - libcusparse=11.7.2.124=h7538f96_0 + - libcusparse-dev=11.7.2.124=hbbe9722_0 + - libdeflate=1.17=h5eee18b_1 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h516909a_0 + - libidn2=2.3.4=h5eee18b_0 + - libnpp=11.6.3.124=hd2722f0_0 + - libnpp-dev=11.6.3.124=h3c42840_0 + - libnvjpeg=11.6.2.124=hd473ad6_0 + - libnvjpeg-dev=11.6.2.124=hb5906b9_0 + - libpng=1.6.39=h5eee18b_0 + - libsodium=1.0.18=h36c2ea0_1 + - libstdcxx-ng=11.2.0=he4da1e4_16 + - libtasn1=4.19.0=h5eee18b_0 + - libtiff=4.5.1=h6a678d5_0 + - libunistring=0.9.10=h7f98852_0 + - libuuid=1.41.5=h5eee18b_0 + - libwebp-base=1.3.2=h5eee18b_0 + - lz4-c=1.9.4=h6a678d5_1 + - mkl=2023.1.0=h213fc3f_46344 + - mkl-service=2.4.0=py310h5eee18b_1 + - mkl_fft=1.3.8=py310h5eee18b_0 + - mkl_random=1.2.4=py310hdb19cb5_0 + - ncurses=6.4=h6a678d5_0 + - nest-asyncio=1.6.0=pyhd8ed1ab_0 + - nettle=3.7.3=hbbd107a_1 + - nsight-compute=2024.1.1.4=0 + - numpy-base=1.26.4=py310hb5e798b_0 + - openh264=2.1.1=h780b84a_0 + - openjpeg=2.4.0=h9ca470c_2 + - openssl=3.0.13=h7f8727e_2 + - packaging=24.0=pyhd8ed1ab_0 + - parso=0.8.4=pyhd8ed1ab_0 + - pexpect=4.9.0=pyhd8ed1ab_0 + - pickleshare=0.7.5=py_1003 + - pip=23.3.1=pyhd8ed1ab_0 + - ptyprocess=0.7.0=pyhd3deb0d_0 + - pure_eval=0.2.2=pyhd8ed1ab_0 + - pygments=2.17.2=pyhd8ed1ab_0 + - pysocks=1.7.1=pyha2e5f31_6 + - python=3.10.14=h955ad1f_0 + - python_abi=3.10=2_cp310 + - pytorch=1.13.1=py3.10_cuda11.6_cudnn8.3.2_0 + - pytorch-cuda=11.6=h867d48c_1 + - pytorch-mutex=1.0=cuda + - pyzmq=25.1.2=py310h6a678d5_0 + - readline=8.2=h5eee18b_0 + - requests=2.31.0=pyhd8ed1ab_0 + - setuptools=68.2.2=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 + - sqlite=3.41.2=h5eee18b_0 + - tbb=2021.8.0=hdb19cb5_0 + - tk=8.6.12=h1ccaba5_0 + - torchaudio=0.13.1=py310_cu116 + - tornado=6.1=py310h5764c6d_3 + - typing_extensions=4.9.0=pyha770c72_0 + - tzdata=2024a=h8827d51_1 + - urllib3=2.1.0=pyhd8ed1ab_0 + - wcwidth=0.2.13=pyhd8ed1ab_0 + - wheel=0.41.2=pyhd8ed1ab_0 + - xz=5.4.6=h5eee18b_1 + - zeromq=4.3.5=h6a678d5_0 + - zlib=1.2.13=h5eee18b_1 + - zstd=1.5.5=hc292b87_2 + - pip: + - absl-py==2.1.0 + - accelerate==0.29.2 + - addict==2.4.0 + - aiofiles==23.2.1 + - aiohttp==3.9.3 + - aiosignal==1.3.1 + - albumentations==1.3.0 + - aliyun-python-sdk-core==2.15.1 + - aliyun-python-sdk-kms==2.16.2 + - annotated-types==0.7.0 + - antlr4-python3-runtime==4.8 + - anyio==4.6.2.post1 + - appdirs==1.4.4 + - async-timeout==4.0.3 + - attrs==23.2.0 + - basicsr==1.4.2 + - beautifulsoup4==4.12.3 + - bitsandbytes==0.35.4 + - black==21.4b2 + - blinker==1.8.2 + - blis==0.7.11 + - boto3==1.34.97 + - botocore==1.34.97 + - bresenham==0.2.1 + - cachetools==5.3.3 + - captum==0.7.0 + - catalogue==2.0.10 + - cffi==1.16.0 + - chardet==5.2.0 + - click==8.1.7 + - clip==0.1.0 + - cloudpickle==3.0.0 + - cmake==3.25.2 + - compel==2.0.3 + - confection==0.1.4 + - contourpy==1.2.1 + - controlnet-aux==0.0.6 + - crcmod==1.7 + - cryptography==42.0.7 + - cssselect2==0.7.0 + - cycler==0.12.1 + - cymem==2.0.8 + - cython==3.0.10 + - datasets==2.18.0 + - decorator==4.4.2 + - decord==0.6.0 + - deepspeed==0.8.0 + - diffdist==0.1 + - diffusers==0.27.2 + - dill==0.3.8 + - docker-pycreds==0.4.0 + - 
easydict==1.10 + - einops==0.3.0 + - fairscale==0.4.13 + - faiss-cpu==1.8.0 + - fastapi==0.115.4 + - ffmpy==0.3.0 + - filelock==3.13.4 + - flask==3.0.3 + - flatbuffers==24.3.25 + - fonttools==4.51.0 + - frozenlist==1.4.1 + - fsspec==2024.2.0 + - ftfy==6.1.1 + - future==1.0.0 + - fvcore==0.1.5.post20221221 + - gast==0.5.4 + - gdown==5.1.0 + - gitdb==4.0.11 + - gitpython==3.1.43 + - google-auth==2.29.0 + - google-auth-oauthlib==0.4.6 + - gradio==5.5.0 + - gradio-client==1.4.2 + - grpcio==1.62.1 + - h11==0.14.0 + - hjson==3.1.0 + - httpcore==1.0.6 + - httpx==0.27.2 + - huggingface-hub==0.25.2 + - hydra-core==1.1.1 + - imageio==2.25.1 + - imageio-ffmpeg==0.4.8 + - importlib-metadata==7.1.0 + - inquirerpy==0.3.4 + - iopath==0.1.9 + - ipdb==0.13.13 + - ipympl==0.9.4 + - ipython==8.23.0 + - ipython-genutils==0.2.0 + - ipywidgets==8.1.2 + - itsdangerous==2.2.0 + - jax==0.4.26 + - jaxlib==0.4.26 + - jinja2==3.1.3 + - jmespath==0.10.0 + - joblib==1.4.2 + - jupyterlab-widgets==3.0.10 + - kiwisolver==1.4.5 + - kornia==0.6.0 + - lightning-utilities==0.11.2 + - lmdb==1.4.1 + - loguru==0.7.2 + - loralib==0.1.2 + - lvis==0.5.3 + - lxml==5.2.1 + - markdown==3.6 + - markdown-it-py==3.0.0 + - markupsafe==2.1.5 + - matplotlib==3.8.4 + - matplotlib-inline==0.1.6 + - mdurl==0.1.2 + - mediapipe==0.10.11 + - ml-dtypes==0.4.0 + - modelcards==0.1.6 + - modelscope==1.14.0 + - motion-vector-extractor==1.0.6 + - moviepy==1.0.3 + - mpmath==1.3.0 + - multidict==6.0.5 + - multiprocess==0.70.16 + - murmurhash==1.0.10 + - mypy-extensions==1.0.0 + - networkx==3.3 + - ninja==1.11.1.1 + - nltk==3.8.1 + - numpy==1.24.2 + - nvidia-cublas-cu11==11.10.3.66 + - nvidia-cuda-cupti-cu12==12.1.105 + - nvidia-cuda-nvrtc-cu11==11.7.99 + - nvidia-cuda-nvrtc-cu12==12.1.105 + - nvidia-cuda-runtime-cu11==11.7.99 + - nvidia-cuda-runtime-cu12==12.1.105 + - nvidia-cudnn-cu11==8.5.0.96 + - nvidia-cufft-cu12==11.0.2.54 + - nvidia-curand-cu12==10.3.2.106 + - nvidia-nccl-cu12==2.20.5 + - nvidia-nvjitlink-cu12==12.6.77 + - nvidia-nvtx-cu12==12.1.105 + - oauthlib==3.2.2 + - omegaconf==2.1.1 + - open-clip-torch==2.0.2 + - opencv-contrib-python==4.9.0.80 + - opencv-python==4.6.0.66 + - opencv-python-headless==4.9.0.80 + - opt-einsum==3.3.0 + - orjson==3.10.11 + - oss2==2.18.5 + - pandas==1.5.3 + - pathspec==0.12.1 + - pathtools==0.1.2 + - peft==0.10.0 + - pfzy==0.3.4 + - pillow==9.5.0 + - pkgconfig==1.5.5 + - platformdirs==4.2.0 + - portalocker==2.8.2 + - preshed==3.0.9 + - proglog==0.1.10 + - prompt-toolkit==3.0.43 + - protobuf==3.20.3 + - psutil==5.9.8 + - py-cpuinfo==9.0.0 + - pyarrow==15.0.2 + - pyarrow-hotfix==0.6 + - pyasn1==0.6.0 + - pyasn1-modules==0.4.0 + - pyav==12.0.5 + - pycocotools==2.0.7 + - pycparser==2.22 + - pycryptodome==3.20.0 + - pydantic==2.9.2 + - pydantic-core==2.23.4 + - pydeprecate==0.3.1 + - pydot==2.0.0 + - pydub==0.25.1 + - pynvml==11.5.3 + - pyparsing==3.1.2 + - pyre-extensions==0.0.23 + - python-dateutil==2.9.0.post0 + - python-multipart==0.0.12 + - pytorch-lightning==1.4.2 + - pytz==2024.1 + - pywavelets==1.6.0 + - pyyaml==6.0.1 + - qudida==0.0.4 + - regex==2024.4.16 + - reportlab==4.1.0 + - requests-oauthlib==2.0.0 + - rich==13.9.4 + - rsa==4.9 + - ruff==0.7.2 + - s3transfer==0.10.1 + - safehttpx==0.1.1 + - safetensors==0.4.2 + - scikit-image==0.19.3 + - scikit-learn==1.4.2 + - scikit-video==1.1.11 + - scipy==1.10.1 + - semantic-version==2.10.0 + - sentry-sdk==1.44.1 + - setproctitle==1.3.3 + - shapely==2.0.3 + - shellingham==1.5.4 + - simplejson==3.19.2 + - smmap==5.0.1 + - sniffio==1.3.1 + - sortedcontainers==2.4.0 + 
- sounddevice==0.4.6 + - soupsieve==2.5 + - srsly==2.4.8 + - stable-diffusion-sdkit==2.1.3 + - stack-data==0.6.3 + - starlette==0.41.2 + - svg-path==6.3 + - svglib==1.5.1 + - svgpathtools==1.6.1 + - svgwrite==1.4.3 + - sympy==1.13.3 + - tabulate==0.9.0 + - tb-nightly==2.17.0a20240408 + - tensorboard==2.12.0 + - tensorboard-data-server==0.7.0 + - tensorboard-plugin-wit==1.8.1 + - termcolor==2.2.0 + - test-tube==0.7.5 + - thinc==8.1.10 + - threadpoolctl==3.5.0 + - tifffile==2024.2.12 + - timm==0.6.11 + - tinycss2==1.2.1 + - tokenizers==0.20.1 + - toml==0.10.2 + - tomli==2.0.1 + - tomlkit==0.12.0 + - torch==1.13.1 + - torchmetrics==0.6.0 + - torchsummary==1.5.1 + - torchvision==0.14.1 + - tqdm==4.64.1 + - traitlets==5.14.2 + - transformers==4.45.2 + - triton==2.3.0 + - typer==0.12.5 + - typing-inspect==0.9.0 + - uvicorn==0.32.0 + - wandb==0.16.6 + - wasabi==1.1.2 + - webencodings==0.5.1 + - websockets==12.0 + - werkzeug==3.0.2 + - widgetsnbextension==4.0.10 + - xformers==0.0.16 + - xxhash==3.4.1 + - yacs==0.1.8 + - yapf==0.40.2 + - yarl==1.9.4 + - zipp==3.18.1 diff --git a/gifs_filter.py b/gifs_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..ecaa366d0b1c163e893682300081d8a6d4960ff1 --- /dev/null +++ b/gifs_filter.py @@ -0,0 +1,68 @@ +# filter images +from PIL import Image, ImageSequence +import requests +from tqdm import tqdm +import numpy as np +import torch +from transformers import CLIPProcessor, CLIPModel + +def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + converted_len = int(clip_len * frame_sample_rate) + end_idx = np.random.randint(converted_len, seg_len) + start_idx = end_idx - converted_len + indices = np.linspace(start_idx, end_idx, num=clip_len) + indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + return indices + +def load_frames(image: Image, mode='RGBA'): + return np.array([ + np.array(frame.convert(mode)) + for frame in ImageSequence.Iterator(image) + ]) + +img_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") +img_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + + +def filter(gifs, input_image): + max_cosine = 0.9 + max_gif = [] + + for gif in tqdm(gifs, total=len(gifs)): + with Image.open(gif) as im: + frames = load_frames(im) + + frames = np.array(frames) + frames = frames[:, :, :, :3] + frames = np.transpose(frames, (0, 3, 1, 2))[1:] + + + + image = Image.open(input_image) + + + inputs = img_processor(images=frames, return_tensors="pt", padding=False) + inputs_base = img_processor(images=image, return_tensors="pt", padding=False) + + with torch.no_grad(): + feat_img_base = img_model.get_image_features(pixel_values=inputs_base["pixel_values"]) + feat_img_vid = img_model.get_image_features(pixel_values=inputs["pixel_values"]) + cos_avg = 0 + avg_score_for_vid = 0 + for i in range(len(feat_img_vid)): + + cosine_similarity = torch.nn.functional.cosine_similarity( + feat_img_base, + feat_img_vid[0].unsqueeze(0), + dim=1) + # print(cosine_similarity) + cos_avg += cosine_similarity.item() + + cos_avg /= len(feat_img_vid) + print("Current cosine similarity: ", cos_avg) + print("Max cosine similarity: ", max_cosine) + if cos_avg > max_cosine: + # max_cosine = cos_avg + max_gif.append(gif) + return max_gif \ No newline at end of file diff --git a/invert_utils.py b/invert_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..802fb8d001c7cc6f24afd07ceff7e2db3f039e5b --- /dev/null +++ b/invert_utils.py @@ -0,0 +1,89 @@ +import os +import imageio +import 
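`filter()` above keeps a GIF when the average CLIP cosine similarity between its frames and the input sketch exceeds 0.9; note that, as written, the inner loop scores `feat_img_vid[0]` on every iteration and never updates `max_cosine`, so 0.9 behaves as a fixed acceptance threshold rather than a running maximum. A per-frame variant of the same scoring, under the same `openai/clip-vit-base-patch32` checkpoint, might look like this sketch:

```python
# Hedged sketch: average per-frame CLIP similarity of one GIF against the input sketch.
import torch
from PIL import Image, ImageSequence
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def gif_similarity(gif_path: str, sketch_path: str) -> float:
    with Image.open(gif_path) as im:
        frames = [f.convert("RGB") for f in ImageSequence.Iterator(im)]
    frame_inputs = clip_processor(images=frames, return_tensors="pt")
    base_inputs = clip_processor(images=Image.open(sketch_path).convert("RGB"), return_tensors="pt")
    with torch.no_grad():
        frame_feats = clip_model.get_image_features(pixel_values=frame_inputs["pixel_values"])
        base_feat = clip_model.get_image_features(pixel_values=base_inputs["pixel_values"])
    sims = torch.nn.functional.cosine_similarity(base_feat, frame_feats, dim=1)
    return sims.mean().item()      # compare against a threshold such as 0.9
```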
numpy as np +from typing import Union + +import torch +import torchvision + +from tqdm import tqdm +from einops import rearrange + + +def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=4, fps=8): + videos = rearrange(videos, "b c t h w -> t b c h w") + outputs = [] + for x in videos: + x = torchvision.utils.make_grid(x, nrow=n_rows) + x = x.transpose(0, 1).transpose(1, 2).squeeze(-1) + if rescale: + x = (x + 1.0) / 2.0 # -1,1 -> 0,1 + x = (x * 255).numpy().astype(np.uint8) + outputs.append(x) + + os.makedirs(os.path.dirname(path), exist_ok=True) + imageio.mimsave(path, outputs, fps=fps) + + +# DDIM Inversion +@torch.no_grad() +def init_prompt(prompt, pipeline): + uncond_input = pipeline.tokenizer( + [""], padding="max_length", max_length=pipeline.tokenizer.model_max_length, + return_tensors="pt" + ) + uncond_embeddings = pipeline.text_encoder(uncond_input.input_ids.to(pipeline.device))[0] + text_input = pipeline.tokenizer( + [prompt], + padding="max_length", + max_length=pipeline.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_embeddings = pipeline.text_encoder(text_input.input_ids.to(pipeline.device))[0] + context = torch.cat([uncond_embeddings, text_embeddings]) + + return context + + +def next_step(model_output: Union[torch.FloatTensor, np.ndarray], timestep: int, + sample: Union[torch.FloatTensor, np.ndarray], ddim_scheduler): + timestep, next_timestep = min( + timestep - ddim_scheduler.config.num_train_timesteps // ddim_scheduler.num_inference_steps, 999), timestep + # try: + alpha_prod_t = ddim_scheduler.alphas_cumprod[timestep] if timestep >= 0 else ddim_scheduler.final_alpha_cumprod + # except: + # alpha_prod_t = ddim_scheduler.alphas_cumprod[timestep] #if timestep >= 0 else ddim_scheduler.final_alpha_cumprod + alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep] + beta_prod_t = 1 - alpha_prod_t + next_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5 + next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output + next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction + return next_sample + + +def get_noise_pred_single(latents, t, context, unet): + noise_pred = unet(latents, t, encoder_hidden_states=context)["sample"] + return noise_pred + + +@torch.no_grad() +def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt): + context = init_prompt(prompt, pipeline) + uncond_embeddings, cond_embeddings = context.chunk(2) + all_latent = [latent] + latent = latent.clone().detach() + for i in tqdm(range(num_inv_steps)): + t = ddim_scheduler.timesteps[len(ddim_scheduler.timesteps) - i - 1] + noise_pred = get_noise_pred_single(latent, t, cond_embeddings, pipeline.unet) + noise_pred_unc = get_noise_pred_single(latent, t, uncond_embeddings, pipeline.unet) + noise_pred = noise_pred_unc + 9.0 * (noise_pred_unc - noise_pred) + latent = next_step(noise_pred, t, latent, ddim_scheduler) + all_latent.append(latent) + return all_latent + + +@torch.no_grad() +def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt=""): + ddim_latents = ddim_loop(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt) + return ddim_latents \ No newline at end of file diff --git a/read_vids.py b/read_vids.py new file mode 100644 index 0000000000000000000000000000000000000000..e9b0932e5d0a30dddd266415f9217e50b164fc23 --- /dev/null +++ b/read_vids.py @@ -0,0 +1,27 @@ +import imageio.v3 as iio +import os +from sys import argv +video_name 
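`next_step()` above is the standard DDIM inversion update: estimate `x_0` from the current latent, then re-noise it to the next, larger timestep, while `ddim_loop()` walks the scheduler's timesteps in reverse and, as written, combines the two UNet predictions as `noise_pred_unc + 9.0 * (noise_pred_unc - noise_pred)`. In the scheduler's cumulative-alpha notation, one inversion step from timestep s to t = s + T_train / N_inv is:

$$
\hat{x}_0 = \frac{x_s - \sqrt{1-\bar{\alpha}_s}\,\epsilon}{\sqrt{\bar{\alpha}_s}},
\qquad
x_t = \sqrt{\bar{\alpha}_t}\,\hat{x}_0 + \sqrt{1-\bar{\alpha}_t}\,\epsilon,
$$

where epsilon is the combined noise prediction and the cumulative alphas come from `ddim_scheduler.alphas_cumprod` (falling back to `final_alpha_cumprod` when s drops below zero).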
= argv[1] + +video = video_name +video_id = video.split("/")[-1].replace(".mp4","") + + +png_base = "png_logs" +try: + os.mkdir(png_base) +except: + pass + +video_id = os.path.join(png_base, video_id) +all_frames = list(iio.imiter(video)) + +ctr = 0 +try: + os.makedirs(video_id) +except: + pass +for idx, frame in enumerate(all_frames): + + iio.imwrite(f"{video_id}/{ctr:03d}.jpg", frame) + ctr += 1 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..75a1f342375829525f9a979a3849fd3caf40e9f2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,44 @@ +accelerate==0.29.2 +blinker==1.9.0 +certifi==2024.8.30 +charset-normalizer==3.4.0 +click==8.1.7 +decorator==4.4.2 +diffusers==0.27.2 +einops==0.8.0 +filelock==3.16.1 +Flask==3.0.3 +fsspec==2024.10.0 +huggingface-hub==0.25.2 +idna==3.10 +imageio==2.36.0 +imageio-ffmpeg==0.5.1 +importlib_metadata==8.5.0 +itsdangerous==2.2.0 +Jinja2==3.1.4 +MarkupSafe==3.0.2 +moviepy==1.0.3 +numpy==1.24.2 +nvidia-cublas-cu11==11.10.3.66 +nvidia-cuda-nvrtc-cu11==11.7.99 +nvidia-cuda-runtime-cu11==11.7.99 +nvidia-cudnn-cu11==8.5.0.96 +opencv-python==4.10.0.84 +packaging==24.2 +pillow==10.4.0 +proglog==0.1.10 +psutil==6.1.0 +python-dotenv==1.0.1 +PyYAML==6.0.2 +regex==2024.11.6 +requests==2.32.3 +safetensors==0.4.5 +tokenizers==0.20.3 +torch==1.13.1 +torchvision==0.14.1 +tqdm==4.67.0 +transformers==4.45.2 +typing_extensions==4.12.2 +urllib3==2.2.3 +Werkzeug==3.1.3 +zipp==3.21.0 diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand0_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif b/static/app_tmp/gif_logs/vid_sketch10-rand0_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif new file mode 100644 index 0000000000000000000000000000000000000000..11b5a4f4d65bd6071ff8185edf8d934f9a5c29c6 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand0_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand0_508fa599-d685-462e-ad06-11ca4fd15d6f.gif b/static/app_tmp/gif_logs/vid_sketch10-rand0_508fa599-d685-462e-ad06-11ca4fd15d6f.gif new file mode 100644 index 0000000000000000000000000000000000000000..79b36dd8ac1ebdcd43251f6ca2e5460156965d54 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand0_508fa599-d685-462e-ad06-11ca4fd15d6f.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand0_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif b/static/app_tmp/gif_logs/vid_sketch10-rand0_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif new file mode 100644 index 0000000000000000000000000000000000000000..53dd1632b4d9fa2ca068924cdd085cf47e14b241 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand0_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand0_dfcba486-0d8c-4d68-9689-97f1fb889213.gif b/static/app_tmp/gif_logs/vid_sketch10-rand0_dfcba486-0d8c-4d68-9689-97f1fb889213.gif new file mode 100644 index 0000000000000000000000000000000000000000..6290344646e5d5920a4f5ab3e8553ec7af83df11 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand0_dfcba486-0d8c-4d68-9689-97f1fb889213.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand1_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif b/static/app_tmp/gif_logs/vid_sketch10-rand1_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif new file mode 100644 index 0000000000000000000000000000000000000000..bcd6e1f1dc508a0c873c2914fadc137f18d7a2e2 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand1_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif differ diff 
--git a/static/app_tmp/gif_logs/vid_sketch10-rand1_508fa599-d685-462e-ad06-11ca4fd15d6f.gif b/static/app_tmp/gif_logs/vid_sketch10-rand1_508fa599-d685-462e-ad06-11ca4fd15d6f.gif new file mode 100644 index 0000000000000000000000000000000000000000..79e6156e8a9f975606a9f8a8954ec82d830d2be9 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand1_508fa599-d685-462e-ad06-11ca4fd15d6f.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand1_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif b/static/app_tmp/gif_logs/vid_sketch10-rand1_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif new file mode 100644 index 0000000000000000000000000000000000000000..d3538b7753561da3fef372dfc29c177551719679 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand1_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand1_dfcba486-0d8c-4d68-9689-97f1fb889213.gif b/static/app_tmp/gif_logs/vid_sketch10-rand1_dfcba486-0d8c-4d68-9689-97f1fb889213.gif new file mode 100644 index 0000000000000000000000000000000000000000..495a75fc1e24756868e6306c7aa3e1aa2fcc3c61 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand1_dfcba486-0d8c-4d68-9689-97f1fb889213.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand2_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif b/static/app_tmp/gif_logs/vid_sketch10-rand2_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif new file mode 100644 index 0000000000000000000000000000000000000000..90134b819d8a93ea6ac64371ff7312f83fc82943 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand2_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand2_508fa599-d685-462e-ad06-11ca4fd15d6f.gif b/static/app_tmp/gif_logs/vid_sketch10-rand2_508fa599-d685-462e-ad06-11ca4fd15d6f.gif new file mode 100644 index 0000000000000000000000000000000000000000..d1e921c261df1568cc7473c6c69fb7736c845030 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand2_508fa599-d685-462e-ad06-11ca4fd15d6f.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand2_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif b/static/app_tmp/gif_logs/vid_sketch10-rand2_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif new file mode 100644 index 0000000000000000000000000000000000000000..3f475f631334381e2f7f52818c5c492ccf06fc8a Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand2_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand2_dfcba486-0d8c-4d68-9689-97f1fb889213.gif b/static/app_tmp/gif_logs/vid_sketch10-rand2_dfcba486-0d8c-4d68-9689-97f1fb889213.gif new file mode 100644 index 0000000000000000000000000000000000000000..a43584a102c91c401d538df0082034d3d813321d Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand2_dfcba486-0d8c-4d68-9689-97f1fb889213.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand3_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif b/static/app_tmp/gif_logs/vid_sketch10-rand3_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif new file mode 100644 index 0000000000000000000000000000000000000000..064c9d832886d0259b5b8e77eb5d835c0ea007c9 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand3_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand3_508fa599-d685-462e-ad06-11ca4fd15d6f.gif b/static/app_tmp/gif_logs/vid_sketch10-rand3_508fa599-d685-462e-ad06-11ca4fd15d6f.gif new file mode 100644 index 
0000000000000000000000000000000000000000..512225ba16a81c83d5d2b2b33f85041341e522a6 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand3_508fa599-d685-462e-ad06-11ca4fd15d6f.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand3_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif b/static/app_tmp/gif_logs/vid_sketch10-rand3_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif new file mode 100644 index 0000000000000000000000000000000000000000..fbdb3ef671e488b34d1e488722dcc97a4b9e8d77 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand3_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand3_dfcba486-0d8c-4d68-9689-97f1fb889213.gif b/static/app_tmp/gif_logs/vid_sketch10-rand3_dfcba486-0d8c-4d68-9689-97f1fb889213.gif new file mode 100644 index 0000000000000000000000000000000000000000..ced90e8e998918a1f39bebffef964c10f2196596 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand3_dfcba486-0d8c-4d68-9689-97f1fb889213.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand4_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif b/static/app_tmp/gif_logs/vid_sketch10-rand4_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif new file mode 100644 index 0000000000000000000000000000000000000000..5b4045f63f78191377749364036ba9276cb8cb0d Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand4_4e766a8e-9d22-4818-8991-e884ce17e5e5.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand4_508fa599-d685-462e-ad06-11ca4fd15d6f.gif b/static/app_tmp/gif_logs/vid_sketch10-rand4_508fa599-d685-462e-ad06-11ca4fd15d6f.gif new file mode 100644 index 0000000000000000000000000000000000000000..124ab53ce653dd1f6062df8a80bea644ab7c7224 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand4_508fa599-d685-462e-ad06-11ca4fd15d6f.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand4_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif b/static/app_tmp/gif_logs/vid_sketch10-rand4_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif new file mode 100644 index 0000000000000000000000000000000000000000..209c09fb2468c51fcbb210ab56df2071607dffe3 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand4_9e7e07af-2adc-47b0-8aa4-716a934690e8.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand4_dfcba486-0d8c-4d68-9689-97f1fb889213.gif b/static/app_tmp/gif_logs/vid_sketch10-rand4_dfcba486-0d8c-4d68-9689-97f1fb889213.gif new file mode 100644 index 0000000000000000000000000000000000000000..df4fd44c56e174c943c7741f52352639fc6f657e Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand4_dfcba486-0d8c-4d68-9689-97f1fb889213.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand5_508fa599-d685-462e-ad06-11ca4fd15d6f.gif b/static/app_tmp/gif_logs/vid_sketch10-rand5_508fa599-d685-462e-ad06-11ca4fd15d6f.gif new file mode 100644 index 0000000000000000000000000000000000000000..049cacdbddc16d194b073a9ce65035e4896861c3 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand5_508fa599-d685-462e-ad06-11ca4fd15d6f.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand6_508fa599-d685-462e-ad06-11ca4fd15d6f.gif b/static/app_tmp/gif_logs/vid_sketch10-rand6_508fa599-d685-462e-ad06-11ca4fd15d6f.gif new file mode 100644 index 0000000000000000000000000000000000000000..8611fc3ca65fa76f7b972c33d12b4325a53d710b Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand6_508fa599-d685-462e-ad06-11ca4fd15d6f.gif differ diff --git 
a/static/app_tmp/gif_logs/vid_sketch10-rand7_508fa599-d685-462e-ad06-11ca4fd15d6f.gif b/static/app_tmp/gif_logs/vid_sketch10-rand7_508fa599-d685-462e-ad06-11ca4fd15d6f.gif new file mode 100644 index 0000000000000000000000000000000000000000..7eda0e79cc3183ace45203a6bafb2ac4bd1c51a7 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand7_508fa599-d685-462e-ad06-11ca4fd15d6f.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand8_508fa599-d685-462e-ad06-11ca4fd15d6f.gif b/static/app_tmp/gif_logs/vid_sketch10-rand8_508fa599-d685-462e-ad06-11ca4fd15d6f.gif new file mode 100644 index 0000000000000000000000000000000000000000..0d3fb01d18bd20598ce1b06cf7aea9deba07ae1d Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand8_508fa599-d685-462e-ad06-11ca4fd15d6f.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch10-rand9_508fa599-d685-462e-ad06-11ca4fd15d6f.gif b/static/app_tmp/gif_logs/vid_sketch10-rand9_508fa599-d685-462e-ad06-11ca4fd15d6f.gif new file mode 100644 index 0000000000000000000000000000000000000000..46ac2c831770b6b161f03f373009308e55d4e502 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch10-rand9_508fa599-d685-462e-ad06-11ca4fd15d6f.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch3-rand0_875203a1-f830-46e7-a287-4a0bc2c3a648.gif b/static/app_tmp/gif_logs/vid_sketch3-rand0_875203a1-f830-46e7-a287-4a0bc2c3a648.gif new file mode 100644 index 0000000000000000000000000000000000000000..ca0b791e7bf7923bd909226372022594a64cdc2b Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch3-rand0_875203a1-f830-46e7-a287-4a0bc2c3a648.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch3-rand1_875203a1-f830-46e7-a287-4a0bc2c3a648.gif b/static/app_tmp/gif_logs/vid_sketch3-rand1_875203a1-f830-46e7-a287-4a0bc2c3a648.gif new file mode 100644 index 0000000000000000000000000000000000000000..d48e4fa3961ed8cd812c0fa743e29c7ff35c4661 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch3-rand1_875203a1-f830-46e7-a287-4a0bc2c3a648.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch3-rand2_875203a1-f830-46e7-a287-4a0bc2c3a648.gif b/static/app_tmp/gif_logs/vid_sketch3-rand2_875203a1-f830-46e7-a287-4a0bc2c3a648.gif new file mode 100644 index 0000000000000000000000000000000000000000..e4d59f3e73c0930db699e247d78c27c4811678e6 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch3-rand2_875203a1-f830-46e7-a287-4a0bc2c3a648.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch3-rand3_875203a1-f830-46e7-a287-4a0bc2c3a648.gif b/static/app_tmp/gif_logs/vid_sketch3-rand3_875203a1-f830-46e7-a287-4a0bc2c3a648.gif new file mode 100644 index 0000000000000000000000000000000000000000..abfb077d01dcc1e1356d41f94c7ccbafd6d7902d Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch3-rand3_875203a1-f830-46e7-a287-4a0bc2c3a648.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch3-rand4_875203a1-f830-46e7-a287-4a0bc2c3a648.gif b/static/app_tmp/gif_logs/vid_sketch3-rand4_875203a1-f830-46e7-a287-4a0bc2c3a648.gif new file mode 100644 index 0000000000000000000000000000000000000000..be6b85bbfa89f568a99fa729307f71f29265505c Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch3-rand4_875203a1-f830-46e7-a287-4a0bc2c3a648.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand0_47fc0372-4688-4a2a-abb3-817ccfee8816.gif b/static/app_tmp/gif_logs/vid_sketch8-rand0_47fc0372-4688-4a2a-abb3-817ccfee8816.gif new file mode 100644 index 
0000000000000000000000000000000000000000..e88a745d864806273987648011af3602c40458d9 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand0_47fc0372-4688-4a2a-abb3-817ccfee8816.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand0_77158110-9239-4771-bb44-a83c3aa47567.gif b/static/app_tmp/gif_logs/vid_sketch8-rand0_77158110-9239-4771-bb44-a83c3aa47567.gif new file mode 100644 index 0000000000000000000000000000000000000000..19ab20e3055681b3bdd5d3a404c16607d4698e2b Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand0_77158110-9239-4771-bb44-a83c3aa47567.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand0_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif b/static/app_tmp/gif_logs/vid_sketch8-rand0_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif new file mode 100644 index 0000000000000000000000000000000000000000..630b17e0470cdfda3c9a4e31d3ca4b60fbf9b939 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand0_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand1_47fc0372-4688-4a2a-abb3-817ccfee8816.gif b/static/app_tmp/gif_logs/vid_sketch8-rand1_47fc0372-4688-4a2a-abb3-817ccfee8816.gif new file mode 100644 index 0000000000000000000000000000000000000000..e4641d9b9ea06c027a8735df3b54ae43ef946ab2 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand1_47fc0372-4688-4a2a-abb3-817ccfee8816.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand1_77158110-9239-4771-bb44-a83c3aa47567.gif b/static/app_tmp/gif_logs/vid_sketch8-rand1_77158110-9239-4771-bb44-a83c3aa47567.gif new file mode 100644 index 0000000000000000000000000000000000000000..f99c40b39c67111bafd799cfd196259b07441d35 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand1_77158110-9239-4771-bb44-a83c3aa47567.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand1_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif b/static/app_tmp/gif_logs/vid_sketch8-rand1_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif new file mode 100644 index 0000000000000000000000000000000000000000..76f1c0ca11e8e90d685d27bc065cf6de68742936 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand1_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand2_47fc0372-4688-4a2a-abb3-817ccfee8816.gif b/static/app_tmp/gif_logs/vid_sketch8-rand2_47fc0372-4688-4a2a-abb3-817ccfee8816.gif new file mode 100644 index 0000000000000000000000000000000000000000..7fa2543f6499c7626369ff4959ffbfa788c8ad38 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand2_47fc0372-4688-4a2a-abb3-817ccfee8816.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand2_77158110-9239-4771-bb44-a83c3aa47567.gif b/static/app_tmp/gif_logs/vid_sketch8-rand2_77158110-9239-4771-bb44-a83c3aa47567.gif new file mode 100644 index 0000000000000000000000000000000000000000..a5e5a98ebb1665b9f5e387d92e47e326fcc0c137 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand2_77158110-9239-4771-bb44-a83c3aa47567.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand2_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif b/static/app_tmp/gif_logs/vid_sketch8-rand2_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif new file mode 100644 index 0000000000000000000000000000000000000000..d45ab522052ee3362353568a019c7ac73ee18926 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand2_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif differ diff --git 
a/static/app_tmp/gif_logs/vid_sketch8-rand3_47fc0372-4688-4a2a-abb3-817ccfee8816.gif b/static/app_tmp/gif_logs/vid_sketch8-rand3_47fc0372-4688-4a2a-abb3-817ccfee8816.gif new file mode 100644 index 0000000000000000000000000000000000000000..cb9d52e01ab26f74d89b548a6824f5325a95fba4 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand3_47fc0372-4688-4a2a-abb3-817ccfee8816.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand3_77158110-9239-4771-bb44-a83c3aa47567.gif b/static/app_tmp/gif_logs/vid_sketch8-rand3_77158110-9239-4771-bb44-a83c3aa47567.gif new file mode 100644 index 0000000000000000000000000000000000000000..968cbc3d30813ee29bcdfa7e38d158a038b37278 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand3_77158110-9239-4771-bb44-a83c3aa47567.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand3_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif b/static/app_tmp/gif_logs/vid_sketch8-rand3_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif new file mode 100644 index 0000000000000000000000000000000000000000..145814d990e5d7b477510d2bb174391af7579e97 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand3_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand4_47fc0372-4688-4a2a-abb3-817ccfee8816.gif b/static/app_tmp/gif_logs/vid_sketch8-rand4_47fc0372-4688-4a2a-abb3-817ccfee8816.gif new file mode 100644 index 0000000000000000000000000000000000000000..758e0c617aedaf3e7e58d3a3f255348a7c535018 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand4_47fc0372-4688-4a2a-abb3-817ccfee8816.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand4_77158110-9239-4771-bb44-a83c3aa47567.gif b/static/app_tmp/gif_logs/vid_sketch8-rand4_77158110-9239-4771-bb44-a83c3aa47567.gif new file mode 100644 index 0000000000000000000000000000000000000000..eec80d0126c125ae9b437d4e7d2c024e446421b8 Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand4_77158110-9239-4771-bb44-a83c3aa47567.gif differ diff --git a/static/app_tmp/gif_logs/vid_sketch8-rand4_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif b/static/app_tmp/gif_logs/vid_sketch8-rand4_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif new file mode 100644 index 0000000000000000000000000000000000000000..5a2f2287a572959708be6f7850645ec58859e38f Binary files /dev/null and b/static/app_tmp/gif_logs/vid_sketch8-rand4_fd1dace5-80a2-4a0f-afb1-c6aa0943c91a.gif differ diff --git a/static/app_tmp/mp4_logs/vid_sketch10-rand0.mp4 b/static/app_tmp/mp4_logs/vid_sketch10-rand0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3702002fd69fa455ba85f4ddf8d3d003f62c1a87 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch10-rand0.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch10-rand1.mp4 b/static/app_tmp/mp4_logs/vid_sketch10-rand1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5808fde6dde0f637b2e7acde0f6da0ae3a669099 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch10-rand1.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch10-rand2.mp4 b/static/app_tmp/mp4_logs/vid_sketch10-rand2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..18e43eae9778bb817d2fe6a088a99e7359a6b8a5 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch10-rand2.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch10-rand3.mp4 b/static/app_tmp/mp4_logs/vid_sketch10-rand3.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..0c7e2b924057598fb6fb3e10e487d95b92366fc2 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch10-rand3.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch10-rand4.mp4 b/static/app_tmp/mp4_logs/vid_sketch10-rand4.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7a736b91d70771edb4cf817e97de52100fffa6a5 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch10-rand4.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch10-rand5.mp4 b/static/app_tmp/mp4_logs/vid_sketch10-rand5.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..49b1c4740f693e67eaf1c6172795b6f6f1513959 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch10-rand5.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch10-rand6.mp4 b/static/app_tmp/mp4_logs/vid_sketch10-rand6.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ced1bb951c8605e49cca3d1f17a0cd294fde8399 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch10-rand6.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch10-rand7.mp4 b/static/app_tmp/mp4_logs/vid_sketch10-rand7.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..21a025e950da196d6a88cf56365b29a999e73ee3 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch10-rand7.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch10-rand8.mp4 b/static/app_tmp/mp4_logs/vid_sketch10-rand8.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..03177283e87822772a4a9c0898b19af18985efcb Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch10-rand8.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch10-rand9.mp4 b/static/app_tmp/mp4_logs/vid_sketch10-rand9.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..12af41f14779303e7a42b6cf6626f9c13e57abc6 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch10-rand9.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch3-rand0.mp4 b/static/app_tmp/mp4_logs/vid_sketch3-rand0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9d58c72dad05428636a4591131ed380726cf4675 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch3-rand0.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch3-rand1.mp4 b/static/app_tmp/mp4_logs/vid_sketch3-rand1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..831e0d7f7eac6a0630d328a8e0165ee97fd9a068 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch3-rand1.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch3-rand2.mp4 b/static/app_tmp/mp4_logs/vid_sketch3-rand2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8cc33b5b713db72861cb93160288beeaa48cc213 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch3-rand2.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch3-rand3.mp4 b/static/app_tmp/mp4_logs/vid_sketch3-rand3.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..2cc9252dfe718dc671db2332adf566cf7cb1faa8 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch3-rand3.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch3-rand4.mp4 b/static/app_tmp/mp4_logs/vid_sketch3-rand4.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..1a109ffbad6f8322b8fa407d9a78dcbe8564519d Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch3-rand4.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch8-rand0.mp4 
b/static/app_tmp/mp4_logs/vid_sketch8-rand0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e1668d3017cdd9b8681594bca5fe6f914620fcfb Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch8-rand0.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch8-rand1.mp4 b/static/app_tmp/mp4_logs/vid_sketch8-rand1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b0e553690fd227a5fccbb359d254850e6c08f7cd Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch8-rand1.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch8-rand2.mp4 b/static/app_tmp/mp4_logs/vid_sketch8-rand2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..29bc34c1477b6588e5f0fc6f96322cc68ed6dba5 Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch8-rand2.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch8-rand3.mp4 b/static/app_tmp/mp4_logs/vid_sketch8-rand3.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b5ad35a48a2c2963a2952cfd5ffc7feb2f234dfc Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch8-rand3.mp4 differ diff --git a/static/app_tmp/mp4_logs/vid_sketch8-rand4.mp4 b/static/app_tmp/mp4_logs/vid_sketch8-rand4.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d619c6a310135d94799eb1dc3ee5a9bb75a942de Binary files /dev/null and b/static/app_tmp/mp4_logs/vid_sketch8-rand4.mp4 differ diff --git a/static/examples/sketch1.png b/static/examples/sketch1.png new file mode 100644 index 0000000000000000000000000000000000000000..0583c7283e9336ba24389966c5290d62efc1b235 Binary files /dev/null and b/static/examples/sketch1.png differ diff --git a/static/examples/sketch10.png b/static/examples/sketch10.png new file mode 100644 index 0000000000000000000000000000000000000000..daa561f185de7271ba3275d2260b3b89d42321bd Binary files /dev/null and b/static/examples/sketch10.png differ diff --git a/static/examples/sketch11.png b/static/examples/sketch11.png new file mode 100644 index 0000000000000000000000000000000000000000..9118593530197a56b46585edc4acf960ef0b2d9e Binary files /dev/null and b/static/examples/sketch11.png differ diff --git a/static/examples/sketch12.png b/static/examples/sketch12.png new file mode 100644 index 0000000000000000000000000000000000000000..b55361e2da7d5fed2e9c942580c0429c75436b56 Binary files /dev/null and b/static/examples/sketch12.png differ diff --git a/static/examples/sketch13.png b/static/examples/sketch13.png new file mode 100644 index 0000000000000000000000000000000000000000..e31425a331056ee54107dec2284307f2fe5bbbe4 Binary files /dev/null and b/static/examples/sketch13.png differ diff --git a/static/examples/sketch15.png b/static/examples/sketch15.png new file mode 100644 index 0000000000000000000000000000000000000000..b8045c964f617f5b1adc454f8d8ee13fc38cb4e4 Binary files /dev/null and b/static/examples/sketch15.png differ diff --git a/static/examples/sketch2.png b/static/examples/sketch2.png new file mode 100644 index 0000000000000000000000000000000000000000..e20cf884bf84a4b789dc4e344f6ec43c689a4c3e Binary files /dev/null and b/static/examples/sketch2.png differ diff --git a/static/examples/sketch3.png b/static/examples/sketch3.png new file mode 100644 index 0000000000000000000000000000000000000000..e53744ea85a3cb4ebd06279500fbe8aea85e040a Binary files /dev/null and b/static/examples/sketch3.png differ diff --git a/static/examples/sketch4.png b/static/examples/sketch4.png new file mode 100644 index 
0000000000000000000000000000000000000000..825e2b96e01ef527df98ef753f1247c133d64f94 Binary files /dev/null and b/static/examples/sketch4.png differ diff --git a/static/examples/sketch5.png b/static/examples/sketch5.png new file mode 100644 index 0000000000000000000000000000000000000000..b21e77dd93c8be87fb60e8b05aea0e9044c36ff7 Binary files /dev/null and b/static/examples/sketch5.png differ diff --git a/static/examples/sketch6.png b/static/examples/sketch6.png new file mode 100644 index 0000000000000000000000000000000000000000..7372989ad7c68a51821afac78cc28bbea9c95816 Binary files /dev/null and b/static/examples/sketch6.png differ diff --git a/static/examples/sketch7.png b/static/examples/sketch7.png new file mode 100644 index 0000000000000000000000000000000000000000..380c8c34e7e913a1a56cecaeb72be8e0c0a8dec4 Binary files /dev/null and b/static/examples/sketch7.png differ diff --git a/static/examples/sketch8.png b/static/examples/sketch8.png new file mode 100644 index 0000000000000000000000000000000000000000..35b6a44f5a97738e45372e185950aa52f56d7da0 Binary files /dev/null and b/static/examples/sketch8.png differ diff --git a/static/examples/sketch9.png b/static/examples/sketch9.png new file mode 100644 index 0000000000000000000000000000000000000000..82484815779a1c7948a46e96c3efde8fe7b1fcd3 Binary files /dev/null and b/static/examples/sketch9.png differ diff --git a/text2vid_modded.py b/text2vid_modded.py new file mode 100644 index 0000000000000000000000000000000000000000..c450ed3650aa8f8c9e9bb2718c02894d0192fba3 --- /dev/null +++ b/text2vid_modded.py @@ -0,0 +1,612 @@ +from typing import Any, Callable, Dict, List, Optional, Union +import numpy as np +import torch +from transformers import CLIPTextModel, CLIPTokenizer +from diffusers.image_processor import VaeImageProcessor +from diffusers.models import AutoencoderKL, UNet3DConditionModel +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( + logging, + replace_example_docstring) +from diffusers.pipelines.text_to_video_synthesis import TextToVideoSDPipelineOutput + + + +TAU_2 = 15 +TAU_1 = 10 + + +def init_attention_params(unet, num_frames, lambda_=None, bs=None): + + + for name, module in unet.named_modules(): + module_name = type(module).__name__ + if module_name == "Attention": + module.LAMBDA = lambda_ + module.bs = bs + module.num_frames = num_frames + module.last_attn_slice_weights = 1 + +def init_attention_func(unet): + # ORIGINAL SOURCE CODE: https://github.com/huggingface/diffusers/blob/91ddd2a25b848df0fa1262d4f1cd98c7ccb87750/src/diffusers/models/attention.py#L276 + # Updated source code: https://github.com/huggingface/diffusers/blob/50296739878f3e17b2d25d45ef626318b44440b9/src/diffusers/models/attention_processor.py#L571 + def get_attention_scores( + self, query, key, attention_mask = None): + r""" + Compute the attention scores. + + Args: + query (`torch.Tensor`): The query tensor. + key (`torch.Tensor`): The key tensor. + attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied. + + Returns: + `torch.Tensor`: The attention probabilities/scores. 
+ """ + + q_old = query.clone() + k_old = key.clone() + + if self.use_last_attn_slice: + if self.last_attn_slice is not None: + query_list = self.last_attn_slice[0] + key_list = self.last_attn_slice[1] + + if query.shape[1] == self.num_frames and query.shape == key.shape: + + key1 = key.clone() + key1[:,:1,:key_list.shape[2]] = key_list[:,:1] + + if q_old.shape == k_old.shape and q_old.shape[1]!=self.num_frames: + + batch_dim = query_list.shape[0] // self.bs + all_dim = query.shape[0] // self.bs + for i in range(self.bs): + query[i*all_dim:(i*all_dim) + batch_dim,:query_list.shape[1],:query_list.shape[2]] = query_list[i*batch_dim:(i+1)*batch_dim] + + + dtype = query.dtype + if self.upcast_attention: + query = query.float() + key = key.float() + + + if attention_mask is None: + baddbmm_input = torch.empty( + query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device + ) + beta = 0 + else: + baddbmm_input = attention_mask + beta = 1 + + + attention_scores = torch.baddbmm( + baddbmm_input, + query, + key.transpose(-1, -2), + beta=beta, + alpha=self.scale, + ) + + if query.shape[1] == self.num_frames and query.shape == key.shape and self.use_last_attn_slice: + attention_scores1 = torch.baddbmm( + baddbmm_input, + query, + key1.transpose(-1, -2), + beta=beta, + alpha=self.scale, + ) + dynamic_lambda = torch.tensor([1 + self.LAMBDA * (i/50) for i in range(self.num_frames)]).to(dtype).cuda() + attention_scores[:,:self.num_frames,0] = attention_scores1[:,:self.num_frames,0] * dynamic_lambda + + + del baddbmm_input + + if self.upcast_softmax: + attention_scores = attention_scores.float() + + attention_probs = attention_scores.softmax(dim=-1) + + + if self.use_last_attn_slice: + self.use_last_attn_slice = False + + if self.save_last_attn_slice: + + self.last_attn_slice = [ + query, + key, + ] + + self.save_last_attn_slice = False + + + + del attention_scores + attention_probs = attention_probs.to(dtype) + + + return attention_probs + + + for _, module in unet.named_modules(): + module_name = type(module).__name__ + + if module_name == "Attention": + module.last_attn_slice = None + module.use_last_attn_slice = False + module.save_last_attn_slice = False + module.LAMBDA = 0 + module.get_attention_scores = get_attention_scores.__get__(module, type(module)) + + module.bs = 0 + module.num_frames = None + + return unet + + +def use_last_self_attention(unet, use=True): + for name, module in unet.named_modules(): + module_name = type(module).__name__ + if module_name == "Attention" and "attn1" in name: + module.use_last_attn_slice = use + +def save_last_self_attention(unet, save=True): + for name, module in unet.named_modules(): + module_name = type(module).__name__ + if module_name == "Attention" and "attn1" in name: + module.save_last_attn_slice = save + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import TextToVideoSDPipeline + >>> from diffusers.utils import export_to_video + + >>> pipe = TextToVideoSDPipeline.from_pretrained( + ... "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16" + ... 
) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "Spiderman is surfing" + >>> video_frames = pipe(prompt).frames[0] + >>> video_path = export_to_video(video_frames) + >>> video_path + ``` +""" + + +# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid +def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + if output_type == "np": + outputs = np.stack(outputs) + + elif output_type == "pt": + outputs = torch.stack(outputs) + + elif not output_type == "pil": + raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") + + return outputs + +from diffusers import TextToVideoSDPipeline +class TextToVideoSDPipelineModded(TextToVideoSDPipeline): + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet3DConditionModel, + scheduler: KarrasDiffusionSchedulers, + ): + super().__init__(vae, text_encoder, tokenizer, unet, scheduler) + + + def call_network(self, + negative_prompt_embeds, + prompt_embeds, + latents, + inv_latents, + t, + i, + null_embeds, + cross_attention_kwargs, + extra_step_kwargs, + do_classifier_free_guidance, + guidance_scale, + ): + + + inv_latent_model_input = inv_latents + inv_latent_model_input = self.scheduler.scale_model_input(inv_latent_model_input, t) + + latent_model_input = latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + + if do_classifier_free_guidance: + noise_pred_uncond = self.unet( + latent_model_input, + t, + encoder_hidden_states=negative_prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + noise_null_pred_uncond = self.unet( + inv_latent_model_input, + t, + encoder_hidden_states=negative_prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + + + if i<=TAU_2: + save_last_self_attention(self.unet) + + + noise_null_pred = self.unet( + inv_latent_model_input, + t, + encoder_hidden_states=null_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_null_pred = noise_null_pred_uncond + guidance_scale * (noise_null_pred - noise_null_pred_uncond) + + bsz, channel, frames, width, height = inv_latents.shape + + inv_latents = inv_latents.permute(0, 2, 1, 3, 4).reshape(bsz*frames, channel, height, width) + noise_null_pred = noise_null_pred.permute(0, 2, 1, 3, 4).reshape(bsz*frames, channel, height, width) + inv_latents = self.scheduler.step(noise_null_pred, t, inv_latents, **extra_step_kwargs).prev_sample + inv_latents = inv_latents[None, :].reshape((bsz, frames , -1) + inv_latents.shape[2:]).permute(0, 2, 1, 3, 4) + + use_last_self_attention(self.unet) + else: + noise_null_pred = None + + + + + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, # For unconditional guidance + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + use_last_self_attention(self.unet, False) + + + if do_classifier_free_guidance: + noise_pred_text = noise_pred + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # reshape latents + bsz, channel, frames, width, height = 
latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + + + # reshape latents back + latents = latents[None, :].reshape(bsz, frames, channel, width, height).permute(0, 2, 1, 3, 4) + + + return { + "latents": latents, + "inv_latents": inv_latents, + "noise_pred": noise_pred, + "noise_null_pred": noise_null_pred, + } + + def optimize_latents(self, latents, inv_latents, t, i, null_embeds, cross_attention_kwargs, prompt_embeds): + inv_scaled = self.scheduler.scale_model_input(inv_latents, t) + + noise_null_pred = self.unet( + inv_scaled[:,:,0:1,:,:], + t, + encoder_hidden_states=null_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + with torch.enable_grad(): + + latent_train = latents[:,:,1:,:,:].clone().detach().requires_grad_(True) + optimizer = torch.optim.Adam([latent_train], lr=1e-3) + + for j in range(10): + latent_in = torch.cat([inv_latents[:,:,0:1,:,:].detach(), latent_train], dim=2) + latent_input_unet = self.scheduler.scale_model_input(latent_in, t) + + noise_pred = self.unet( + latent_input_unet, + t, + encoder_hidden_states=prompt_embeds, # For unconditional guidance + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + loss = torch.nn.functional.mse_loss(noise_pred[:,:,0,:,:], noise_null_pred[:,:,0,:,:]) + + loss.backward() + + optimizer.step() + optimizer.zero_grad() + + print("Iteration {} Subiteration {} Loss {} ".format(i, j, loss.item())) + latents = latent_in.detach() + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: int = 16, + num_inference_steps: int = 50, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + inv_latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "np", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + lambda_ = 0.5, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. + num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
More denoising steps usually lead to a higher quality video at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 9.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what not to include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"np"`): + The output format of the generated video. Choose between `torch.FloatTensor` and `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that is called every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings.
+ Examples: + + Returns: + [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_images_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # # 2. Define call parameters + # if prompt is not None and isinstance(prompt, str): + # batch_size = 1 + # elif prompt is not None and isinstance(prompt, list): + # batch_size = len(prompt) + # else: + # batch_size = prompt_embeds.shape[0] + + batch_size = inv_latents.shape[0] + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + [prompt] * batch_size, + device, + num_images_per_prompt, + do_classifier_free_guidance, + [negative_prompt] * batch_size if negative_prompt is not None else None, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + null_embeds, negative_prompt_embeds = self.encode_prompt( + [""] * batch_size, + device, + num_images_per_prompt, + do_classifier_free_guidance, + [negative_prompt] * batch_size if negative_prompt is not None else None, + prompt_embeds=None, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + + + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + inv_latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + inv_latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + init_attention_func(self.unet) + print("Setup for Current Run") + print("----------------------") + print("Prompt ", prompt) + print("Batch size ", batch_size) + print("Num frames ", latents.shape[2]) + print("Lambda ", lambda_) + + init_attention_params(self.unet, num_frames=latents.shape[2], lambda_=lambda_, bs = batch_size) + + iters_to_alter = [-1]#i for i in range(0, TAU_1)] + + + with self.progress_bar(total=num_inference_steps) as progress_bar: + + mask_in = torch.zeros(latents.shape).to(dtype=latents.dtype, device=latents.device) + mask_in[:, :, 0, :, :] = 1 + assert latents.shape[0] == inv_latents.shape[0], "Latents and Inverse Latents should have the same batch but got {} and {}".format(latents.shape[0], inv_latents.shape[0]) + inv_latents = inv_latents.repeat(1,1,num_frames,1,1) + + latents = inv_latents * mask_in + latents * (1-mask_in) + + + + for i, t in enumerate(timesteps): + + curr_copy = max(1,num_frames - i) + inv_latents = inv_latents[:,:,:curr_copy, :, : ] + if i in iters_to_alter: + + latents = self.optimize_latents(latents, inv_latents, t, i, null_embeds, cross_attention_kwargs, prompt_embeds) + + + output_dict = self.call_network( + negative_prompt_embeds, + prompt_embeds, + latents, + inv_latents, + t, + i, + null_embeds, + cross_attention_kwargs, + extra_step_kwargs, + do_classifier_free_guidance, + guidance_scale, + ) + latents = output_dict["latents"] + inv_latents = output_dict["inv_latents"] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 8. Post processing + if output_type == "latent": + video = latents + else: + video_tensor = self.decode_latents(latents) + video = tensor2vid(video_tensor, self.image_processor, output_type) + + # 9. 
Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return TextToVideoSDPipelineOutput(frames=video) \ No newline at end of file diff --git a/text2vid_modded_full.py b/text2vid_modded_full.py new file mode 100644 index 0000000000000000000000000000000000000000..9e4aa29c674991c41e724f751030e41481e4150d --- /dev/null +++ b/text2vid_modded_full.py @@ -0,0 +1,612 @@ +from typing import Any, Callable, Dict, List, Optional, Union +import numpy as np +import torch +from transformers import CLIPTextModel, CLIPTokenizer +from diffusers.image_processor import VaeImageProcessor +from diffusers.models import AutoencoderKL, UNet3DConditionModel +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( + logging, + replace_example_docstring) +from diffusers.pipelines.text_to_video_synthesis import TextToVideoSDPipelineOutput + + + +TAU_2 = 15 +TAU_1 = 10 + + +def init_attention_params(unet, num_frames, lambda_=None, bs=None): + + + for name, module in unet.named_modules(): + module_name = type(module).__name__ + if module_name == "Attention": + module.LAMBDA = lambda_ + module.bs = bs + module.num_frames = num_frames + module.last_attn_slice_weights = 1 + +def init_attention_func(unet): + # ORIGINAL SOURCE CODE: https://github.com/huggingface/diffusers/blob/91ddd2a25b848df0fa1262d4f1cd98c7ccb87750/src/diffusers/models/attention.py#L276 + # Updated source code: https://github.com/huggingface/diffusers/blob/50296739878f3e17b2d25d45ef626318b44440b9/src/diffusers/models/attention_processor.py#L571 + def get_attention_scores( + self, query, key, attention_mask = None): + r""" + Compute the attention scores. + + Args: + query (`torch.Tensor`): The query tensor. + key (`torch.Tensor`): The key tensor. + attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied. + + Returns: + `torch.Tensor`: The attention probabilities/scores. 
+ """ + + q_old = query.clone() + k_old = key.clone() + + if self.use_last_attn_slice: + if self.last_attn_slice is not None: + query_list = self.last_attn_slice[0] + key_list = self.last_attn_slice[1] + + if query.shape[1] == self.num_frames and query.shape == key.shape: + + key1 = key.clone() + key1[:,:1,:key_list.shape[2]] = key_list[:,:1] + + if q_old.shape == k_old.shape and q_old.shape[1]!=self.num_frames: + + batch_dim = query_list.shape[0] // self.bs + all_dim = query.shape[0] // self.bs + for i in range(self.bs): + query[i*all_dim:(i*all_dim) + batch_dim,:query_list.shape[1],:query_list.shape[2]] = query_list[i*batch_dim:(i+1)*batch_dim] + + + dtype = query.dtype + if self.upcast_attention: + query = query.float() + key = key.float() + + + if attention_mask is None: + baddbmm_input = torch.empty( + query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device + ) + beta = 0 + else: + baddbmm_input = attention_mask + beta = 1 + + + attention_scores = torch.baddbmm( + baddbmm_input, + query, + key.transpose(-1, -2), + beta=beta, + alpha=self.scale, + ) + + if query.shape[1] == self.num_frames and query.shape == key.shape and self.use_last_attn_slice: + attention_scores1 = torch.baddbmm( + baddbmm_input, + query, + key1.transpose(-1, -2), + beta=beta, + alpha=self.scale, + ) + dynamic_lambda = torch.tensor([1 + self.LAMBDA * (i/50) for i in range(self.num_frames)]).to(dtype).cuda() + attention_scores[:,:self.num_frames,0] = attention_scores1[:,:self.num_frames,0] * dynamic_lambda + + + del baddbmm_input + + if self.upcast_softmax: + attention_scores = attention_scores.float() + + attention_probs = attention_scores.softmax(dim=-1) + + + if self.use_last_attn_slice: + self.use_last_attn_slice = False + + if self.save_last_attn_slice: + + self.last_attn_slice = [ + query, + key, + ] + + self.save_last_attn_slice = False + + + + del attention_scores + attention_probs = attention_probs.to(dtype) + + + return attention_probs + + + for _, module in unet.named_modules(): + module_name = type(module).__name__ + + if module_name == "Attention": + module.last_attn_slice = None + module.use_last_attn_slice = False + module.save_last_attn_slice = False + module.LAMBDA = 0 + module.get_attention_scores = get_attention_scores.__get__(module, type(module)) + + module.bs = 0 + module.num_frames = None + + return unet + + +def use_last_self_attention(unet, use=True): + for name, module in unet.named_modules(): + module_name = type(module).__name__ + if module_name == "Attention" and "attn1" in name: + module.use_last_attn_slice = use + +def save_last_self_attention(unet, save=True): + for name, module in unet.named_modules(): + module_name = type(module).__name__ + if module_name == "Attention" and "attn1" in name: + module.save_last_attn_slice = save + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import TextToVideoSDPipeline + >>> from diffusers.utils import export_to_video + + >>> pipe = TextToVideoSDPipeline.from_pretrained( + ... "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16" + ... 
) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "Spiderman is surfing" + >>> video_frames = pipe(prompt).frames[0] + >>> video_path = export_to_video(video_frames) + >>> video_path + ``` +""" + + +# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid +def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + if output_type == "np": + outputs = np.stack(outputs) + + elif output_type == "pt": + outputs = torch.stack(outputs) + + elif not output_type == "pil": + raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") + + return outputs + +from diffusers import TextToVideoSDPipeline +class TextToVideoSDPipelineModded(TextToVideoSDPipeline): + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet3DConditionModel, + scheduler: KarrasDiffusionSchedulers, + ): + super().__init__(vae, text_encoder, tokenizer, unet, scheduler) + + + def call_network(self, + negative_prompt_embeds, + prompt_embeds, + latents, + inv_latents, + t, + i, + null_embeds, + cross_attention_kwargs, + extra_step_kwargs, + do_classifier_free_guidance, + guidance_scale, + ): + + + inv_latent_model_input = inv_latents + inv_latent_model_input = self.scheduler.scale_model_input(inv_latent_model_input, t) + + latent_model_input = latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + + if do_classifier_free_guidance: + noise_pred_uncond = self.unet( + latent_model_input, + t, + encoder_hidden_states=negative_prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + noise_null_pred_uncond = self.unet( + inv_latent_model_input, + t, + encoder_hidden_states=negative_prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + + + if i<=TAU_2: + save_last_self_attention(self.unet) + + + noise_null_pred = self.unet( + inv_latent_model_input, + t, + encoder_hidden_states=null_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_null_pred = noise_null_pred_uncond + guidance_scale * (noise_null_pred - noise_null_pred_uncond) + + bsz, channel, frames, width, height = inv_latents.shape + + inv_latents = inv_latents.permute(0, 2, 1, 3, 4).reshape(bsz*frames, channel, height, width) + noise_null_pred = noise_null_pred.permute(0, 2, 1, 3, 4).reshape(bsz*frames, channel, height, width) + inv_latents = self.scheduler.step(noise_null_pred, t, inv_latents, **extra_step_kwargs).prev_sample + inv_latents = inv_latents[None, :].reshape((bsz, frames , -1) + inv_latents.shape[2:]).permute(0, 2, 1, 3, 4) + + use_last_self_attention(self.unet) + else: + noise_null_pred = None + + + + + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, # For unconditional guidance + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + use_last_self_attention(self.unet, False) + + + if do_classifier_free_guidance: + noise_pred_text = noise_pred + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # reshape latents + bsz, channel, frames, width, height = 
latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + + + # reshape latents back + latents = latents[None, :].reshape(bsz, frames, channel, width, height).permute(0, 2, 1, 3, 4) + + + return { + "latents": latents, + "inv_latents": inv_latents, + "noise_pred": noise_pred, + "noise_null_pred": noise_null_pred, + } + + def optimize_latents(self, latents, inv_latents, t, i, null_embeds, cross_attention_kwargs, prompt_embeds): + inv_scaled = self.scheduler.scale_model_input(inv_latents, t) + + noise_null_pred = self.unet( + inv_scaled[:,:,0:1,:,:], + t, + encoder_hidden_states=null_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + with torch.enable_grad(): + + latent_train = latents[:,:,1:,:,:].clone().detach().requires_grad_(True) + optimizer = torch.optim.Adam([latent_train], lr=1e-3) + + for j in range(10): + latent_in = torch.cat([inv_latents[:,:,0:1,:,:].detach(), latent_train], dim=2) + latent_input_unet = self.scheduler.scale_model_input(latent_in, t) + + noise_pred = self.unet( + latent_input_unet, + t, + encoder_hidden_states=prompt_embeds, # For unconditional guidance + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + loss = torch.nn.functional.mse_loss(noise_pred[:,:,0,:,:], noise_null_pred[:,:,0,:,:]) + + loss.backward() + + optimizer.step() + optimizer.zero_grad() + + print("Iteration {} Subiteration {} Loss {} ".format(i, j, loss.item())) + latents = latent_in.detach() + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: int = 16, + num_inference_steps: int = 50, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + inv_latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "np", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + lambda_ = 0.5, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. + num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
More denoising steps usually lead to a higher quality video at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 9.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what not to include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"np"`): + The output format of the generated video. Choose between `torch.FloatTensor` and `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that is called every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings.
+ Examples: + + Returns: + [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_images_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # # 2. Define call parameters + # if prompt is not None and isinstance(prompt, str): + # batch_size = 1 + # elif prompt is not None and isinstance(prompt, list): + # batch_size = len(prompt) + # else: + # batch_size = prompt_embeds.shape[0] + + batch_size = inv_latents.shape[0] + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + [prompt] * batch_size, + device, + num_images_per_prompt, + do_classifier_free_guidance, + [negative_prompt] * batch_size if negative_prompt is not None else None, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + null_embeds, negative_prompt_embeds = self.encode_prompt( + [""] * batch_size, + device, + num_images_per_prompt, + do_classifier_free_guidance, + [negative_prompt] * batch_size if negative_prompt is not None else None, + prompt_embeds=None, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + + + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + inv_latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + inv_latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + init_attention_func(self.unet) + print("Setup for Current Run") + print("----------------------") + print("Prompt ", prompt) + print("Batch size ", batch_size) + print("Num frames ", latents.shape[2]) + print("Lambda ", lambda_) + + init_attention_params(self.unet, num_frames=latents.shape[2], lambda_=lambda_, bs = batch_size) + + iters_to_alter = [i for i in range(0, TAU_1)] + + + with self.progress_bar(total=num_inference_steps) as progress_bar: + + mask_in = torch.zeros(latents.shape).to(dtype=latents.dtype, device=latents.device) + mask_in[:, :, 0, :, :] = 1 + assert latents.shape[0] == inv_latents.shape[0], "Latents and Inverse Latents should have the same batch but got {} and {}".format(latents.shape[0], inv_latents.shape[0]) + inv_latents = inv_latents.repeat(1,1,num_frames,1,1) + + latents = inv_latents * mask_in + latents * (1-mask_in) + + + + for i, t in enumerate(timesteps): + + curr_copy = max(1,num_frames - i) + inv_latents = inv_latents[:,:,:curr_copy, :, : ] + if i in iters_to_alter: + + latents = self.optimize_latents(latents, inv_latents, t, i, null_embeds, cross_attention_kwargs, prompt_embeds) + + + output_dict = self.call_network( + negative_prompt_embeds, + prompt_embeds, + latents, + inv_latents, + t, + i, + null_embeds, + cross_attention_kwargs, + extra_step_kwargs, + do_classifier_free_guidance, + guidance_scale, + ) + latents = output_dict["latents"] + inv_latents = output_dict["inv_latents"] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 8. Post processing + if output_type == "latent": + video = latents + else: + video_tensor = self.decode_latents(latents) + video = tensor2vid(video_tensor, self.image_processor, output_type) + + # 9. Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return TextToVideoSDPipelineOutput(frames=video) \ No newline at end of file
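
The two pipeline files above differ only in `iters_to_alter`: `text2vid_modded.py` sets it to `[-1]` (first-frame latent optimization disabled), while `text2vid_modded_full.py` uses `[i for i in range(0, TAU_1)]` so `optimize_latents` runs for the first `TAU_1` denoising steps. In both, the patched `get_attention_scores` reuses the self-attention state saved on the inverted-sketch branch for the first `TAU_2` steps and scales the temporal-attention score toward frame 0 by `1 + lambda_ * (i / 50)`. Below is a minimal, hypothetical sketch of driving `TextToVideoSDPipelineModded` directly; it assumes the stock `damo-vilab/text-to-video-ms-1.7b` weights, a CUDA device, and a DDIM-inverted first-frame latent — the random `inv_latents` tensor and the caption are illustrative placeholders, not values used by the repository:

```py
import torch
from text2vid_modded import TextToVideoSDPipelineModded

# Load the modded subclass on top of the standard text-to-video checkpoint.
# (Its __init__ takes the same components as TextToVideoSDPipeline, so
# from_pretrained can be called on the subclass directly.)
pipe = TextToVideoSDPipelineModded.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.bfloat16
).to("cuda")

# Stand-in for a DDIM-inverted sketch latent of shape (batch, 4, 1, H/8, W/8);
# inside __call__ it is repeated across num_frames and pinned to frame 0.
inv_latents = torch.randn(1, 4, 1, 32, 32, dtype=torch.bfloat16, device="cuda")

result = pipe(
    prompt="The biker is riding a bike.",  # illustrative caption
    inv_latents=inv_latents,
    num_frames=16,
    num_inference_steps=25,
    guidance_scale=9.0,
    lambda_=0.5,  # strength of first-frame attention sharing
    output_type="np",
)
frames = result.frames  # stacked numpy frames, one entry per batch element
```

Raising `lambda_` increases the dynamic scaling of attention toward the sketch frame, so later frames should stay closer to the input drawing at the cost of motion; lowering it gives the text prompt more freedom.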