diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..551c4bd3d2e99dffe8c4341d4099932a8e718744 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +test_imgs/ai-generated-8255456_1280.png filter=lfs diff=lfs merge=lfs -text +test_imgs/ai-generated-8489879_1280.png filter=lfs diff=lfs merge=lfs -text diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..b6b421220f403b8e7b8ce90c3d9d6db99ac3bc95 --- /dev/null +++ b/app.py @@ -0,0 +1,256 @@ +import gradio as gr + +# import gradio.helpers +import torch +import os +from glob import glob +from pathlib import Path +from typing import Optional + +from PIL import Image +from diffusers.utils import load_image, export_to_video +from pipeline import StableVideoDiffusionPipeline + +import random +from safetensors import safe_open +from lcm_scheduler import AnimateLCMSVDStochasticIterativeScheduler + + +def get_safetensors_files(): + models_dir = "./safetensors" + safetensors_files = [ + f for f in os.listdir(models_dir) if f.endswith(".safetensors") + ] + return safetensors_files + + +def model_select(selected_file): + print("load model weights", selected_file) + pipe.unet.cpu() + file_path = os.path.join("./safetensors", selected_file) + state_dict = {} + with safe_open(file_path, framework="pt", device="cpu") as f: + for key in f.keys(): + state_dict[key] = f.get_tensor(key) + missing, unexpected = pipe.unet.load_state_dict(state_dict, strict=True) + pipe.unet.cuda() + del state_dict + return + + +noise_scheduler = AnimateLCMSVDStochasticIterativeScheduler( + num_train_timesteps=40, + sigma_min=0.002, + sigma_max=700.0, + sigma_data=1.0, + s_noise=1.0, + rho=7, + clip_denoised=False, +) +pipe = StableVideoDiffusionPipeline.from_pretrained( + "stabilityai/stable-video-diffusion-img2vid-xt", + scheduler=noise_scheduler, + torch_dtype=torch.float16, + variant="fp16", +) +pipe.to("cuda") +pipe.enable_model_cpu_offload() # for smaller cost +model_select("AnimateLCM-SVD-xt.safetensors") +# pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) # for faster inference + + +max_64_bit_int = 2**63 - 1 + + +def sample( + image: Image, + seed: Optional[int] = 42, + randomize_seed: bool = False, + motion_bucket_id: int = 80, + fps_id: int = 8, + max_guidance_scale: float = 1.2, + min_guidance_scale: float = 1, + width: int = 1024, + height: int = 576, + num_inference_steps: int = 4, + decoding_t: int = 4, # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary. 
+ output_folder: str = "outputs_gradio", +): + if image.mode == "RGBA": + image = image.convert("RGB") + + if randomize_seed: + seed = random.randint(0, max_64_bit_int) + generator = torch.manual_seed(seed) + + os.makedirs(output_folder, exist_ok=True) + base_count = len(glob(os.path.join(output_folder, "*.mp4"))) + video_path = os.path.join(output_folder, f"{base_count:06d}.mp4") + + with torch.autocast("cuda"): + frames = pipe( + image, + decode_chunk_size=decoding_t, + generator=generator, + motion_bucket_id=motion_bucket_id, + height=height, + width=width, + num_inference_steps=num_inference_steps, + min_guidance_scale=min_guidance_scale, + max_guidance_scale=max_guidance_scale, + ).frames[0] + export_to_video(frames, video_path, fps=fps_id) + torch.manual_seed(seed) + + return video_path, seed + + +def resize_image(image, output_size=(1024, 576)): + # Calculate aspect ratios + target_aspect = output_size[0] / output_size[1] # Aspect ratio of the desired size + image_aspect = image.width / image.height # Aspect ratio of the original image + + # Resize then crop if the original image is larger + if image_aspect > target_aspect: + # Resize the image to match the target height, maintaining aspect ratio + new_height = output_size[1] + new_width = int(new_height * image_aspect) + resized_image = image.resize((new_width, new_height), Image.LANCZOS) + # Calculate coordinates for cropping + left = (new_width - output_size[0]) / 2 + top = 0 + right = (new_width + output_size[0]) / 2 + bottom = output_size[1] + else: + # Resize the image to match the target width, maintaining aspect ratio + new_width = output_size[0] + new_height = int(new_width / image_aspect) + resized_image = image.resize((new_width, new_height), Image.LANCZOS) + # Calculate coordinates for cropping + left = 0 + top = (new_height - output_size[1]) / 2 + right = output_size[0] + bottom = (new_height + output_size[1]) / 2 + + # Crop the image + cropped_image = resized_image.crop((left, top, right, bottom)) + return cropped_image + + +with gr.Blocks() as demo: + gr.Markdown( + """ + # [AnimateLCM: Accelerating the Animation of Personalized Diffusion Models and Adapters with Decoupled Consistency Learning](https://arxiv.org/abs/2402.00769) + Fu-Yun Wang, Zhaoyang Huang (*Corresponding Author), Xiaoyu Shi, Weikang Bian, Guanglu Song, Yu Liu, Hongsheng Li (*Corresponding Author)
+ [arXiv Report](https://arxiv.org/abs/2402.00769) | [Project Page](https://animatelcm.github.io/) | [Github](https://github.com/G-U-N/AnimateLCM) | [Civitai](https://civitai.com/models/290375/animatelcm-fast-video-generation) | [Replicate](https://replicate.com/camenduru/animate-lcm) + Related Models: + [AnimateLCM-t2v](https://huggingface.co/wangfuyun/AnimateLCM): Personalized Text-to-Video Generation + [AnimateLCM-SVD-xt](https://huggingface.co/wangfuyun/AnimateLCM-SVD-xt): General Image-to-Video Generation + [AnimateLCM-i2v](https://huggingface.co/wangfuyun/AnimateLCM-I2V): Personalized Image-to-Video Generation + """ + ) + with gr.Row(): + with gr.Column(): + image = gr.Image(label="Upload your image", type="pil") + generate_btn = gr.Button("Generate") + video = gr.Video() + with gr.Accordion("Advanced options", open=False): + safetensors_dropdown = gr.Dropdown( + label="Choose Safetensors", choices=get_safetensors_files() + ) + seed = gr.Slider( + label="Seed", + value=42, + randomize=False, + minimum=0, + maximum=max_64_bit_int, + step=1, + ) + randomize_seed = gr.Checkbox(label="Randomize seed", value=False) + motion_bucket_id = gr.Slider( + label="Motion bucket id", + info="Controls how much motion to add/remove from the image", + value=80, + minimum=1, + maximum=255, + ) + fps_id = gr.Slider( + label="Frames per second", + info="The length of your video in seconds will be 25/fps", + value=8, + minimum=5, + maximum=30, + ) + width = gr.Slider( + label="Width of input image", + info="It should be divisible by 64", + value=1024, + minimum=576, + maximum=2048, + ) + height = gr.Slider( + label="Height of input image", + info="It should be divisible by 64", + value=576, + minimum=320, + maximum=1152, + ) + max_guidance_scale = gr.Slider( + label="Max guidance scale", + info="classifier-free guidance strength", + value=1.2, + minimum=1, + maximum=2, + ) + min_guidance_scale = gr.Slider( + label="Min guidance scale", + info="classifier-free guidance strength", + value=1, + minimum=1, + maximum=1.5, + ) + num_inference_steps = gr.Slider( + label="Num inference steps", + info="steps for inference", + value=4, + minimum=1, + maximum=20, + step=1, + ) + + image.upload(fn=resize_image, inputs=image, outputs=image, queue=False) + generate_btn.click( + fn=sample, + inputs=[ + image, + seed, + randomize_seed, + motion_bucket_id, + fps_id, + max_guidance_scale, + min_guidance_scale, + width, + height, + num_inference_steps, + ], + outputs=[video, seed], + api_name="video", + ) + safetensors_dropdown.change(fn=model_select, inputs=safetensors_dropdown) + + gr.Examples( + examples=[ + "test_imgs/ai-generated-8255456_1280.png", + "test_imgs/ai-generated-8496135_1280.jpg", + "test_imgs/dog-7396912_1280.jpg", + "test_imgs/ship-7833921_1280.jpg", + ], + inputs=image, + outputs=[video, seed], + fn=sample, + cache_examples=True, + ) + +if __name__ == "__main__": + demo.queue(max_size=20, api_open=False) + demo.launch(share=True, show_api=False) diff --git a/enviroment.yaml b/enviroment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9a2d3f582b3d32591efa3378b257da5e1ea3104b --- /dev/null +++ b/enviroment.yaml @@ -0,0 +1,125 @@ +name: svd +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - ca-certificates=2023.12.12=h06a4308_0 + - ld_impl_linux-64=2.38=h1181459_1 + - libffi=3.4.4=h6a678d5_0 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libstdcxx-ng=11.2.0=h1234567_1 + - ncurses=6.4=h6a678d5_0 + - 
openssl=3.0.12=h7f8727e_0 + - pip=23.3.1=py39h06a4308_0 + - python=3.9.18=h955ad1f_0 + - readline=8.2=h5eee18b_0 + - setuptools=68.2.2=py39h06a4308_0 + - sqlite=3.41.2=h5eee18b_0 + - tk=8.6.12=h1ccaba5_0 + - wheel=0.41.2=py39h06a4308_0 + - xz=5.4.5=h5eee18b_0 + - zlib=1.2.13=h5eee18b_0 + - pip: + - accelerate==0.26.1 + - albumentations==1.3.1 + - antlr4-python3-runtime==4.9.3 + - appdirs==1.4.4 + - bitsandbytes==0.42.0 + - braceexpand==0.1.7 + - brotli==1.1.0 + - certifi==2023.11.17 + - cffi==1.16.0 + - charset-normalizer==3.3.2 + - click==8.1.7 + - dataclasses==0.6 + - decord==0.6.0 + - diffusers==0.25.1 + - docker-pycreds==0.4.0 + - docopt==0.6.2 + - einops==0.7.0 + - exifread-nocycle==3.0.1 + - ffmpeg-python==0.2.0 + - filelock==3.13.1 + - fire==0.5.0 + - fsspec==2023.12.2 + - future==0.18.3 + - gitdb==4.0.11 + - gitpython==3.1.41 + - huggingface-hub==0.20.3 + - idna==3.6 + - imageio==2.33.1 + - img2dataset==1.45.0 + - importlib-metadata==7.0.1 + - jinja2==3.1.3 + - joblib==1.3.2 + - langdetect==1.0.9 + - lazy-loader==0.3 + - markupsafe==2.1.4 + - mpmath==1.3.0 + - mutagen==1.47.0 + - networkx==3.2.1 + - numpy==1.26.3 + - nvidia-cublas-cu12==12.1.3.1 + - nvidia-cuda-cupti-cu12==12.1.105 + - nvidia-cuda-nvrtc-cu12==12.1.105 + - nvidia-cuda-runtime-cu12==12.1.105 + - nvidia-cudnn-cu12==8.9.2.26 + - nvidia-cufft-cu12==11.0.2.54 + - nvidia-curand-cu12==10.3.2.106 + - nvidia-cusolver-cu12==11.4.5.107 + - nvidia-cusparse-cu12==12.1.0.106 + - nvidia-nccl-cu12==2.19.3 + - nvidia-nvjitlink-cu12==12.3.101 + - nvidia-nvtx-cu12==12.1.105 + - omegaconf==2.3.0 + - opencv-python==4.9.0.80 + - opencv-python-headless==4.9.0.80 + - packaging==23.2 + - pandas==2.2.0 + - pillow==10.2.0 + - platformdirs==4.1.0 + - protobuf==4.25.2 + - psutil==5.9.8 + - pyarrow==15.0.0 + - pycparser==2.21 + - pycryptodomex==3.20.0 + - python-dateutil==2.8.2 + - pytz==2023.3.post1 + - pyyaml==6.0.1 + - qudida==0.0.4 + - regex==2023.12.25 + - requests==2.31.0 + - safetensors==0.4.2 + - scenedetect==0.6.2 + - scikit-image==0.22.0 + - scikit-learn==1.4.0 + - scipy==1.12.0 + - sentry-sdk==1.39.2 + - setproctitle==1.3.3 + - six==1.16.0 + - smmap==5.0.1 + - soundfile==0.12.1 + - sympy==1.12 + - termcolor==2.4.0 + - threadpoolctl==3.2.0 + - tifffile==2023.12.9 + - timeout-decorator==0.5.0 + - tokenizers==0.15.1 + - torch==2.2.0 + - torchdata==0.7.1 + - torchvision==0.17.0 + - tqdm==4.66.1 + - transformers==4.37.0 + - triton==2.2.0 + - typing-extensions==4.9.0 + - tzdata==2023.4 + - urllib3==2.1.0 + - wandb==0.16.2 + - webdataset==0.2.86 + - websockets==12.0 + - webvtt-py==0.4.6 + - xformers==0.0.24 + - yt-dlp==2023.12.30 + - zipp==3.17.0 diff --git a/gradio_cached_examples/19/component 0/3725061b1c373489a048/000003.mp4 b/gradio_cached_examples/19/component 0/3725061b1c373489a048/000003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..f588d8bd00751003613521670313ae48493d42da Binary files /dev/null and b/gradio_cached_examples/19/component 0/3725061b1c373489a048/000003.mp4 differ diff --git a/gradio_cached_examples/19/component 0/a21547779ff20817de06/000002.mp4 b/gradio_cached_examples/19/component 0/a21547779ff20817de06/000002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fc23c00eff7d435f3a57aab056ef3f14ae411a63 Binary files /dev/null and b/gradio_cached_examples/19/component 0/a21547779ff20817de06/000002.mp4 differ diff --git a/gradio_cached_examples/19/component 0/ab669c2acaeb6f957c50/000001.mp4 b/gradio_cached_examples/19/component 0/ab669c2acaeb6f957c50/000001.mp4 new file mode 
100644 index 0000000000000000000000000000000000000000..281fb7807b57615b4d975508820c6d14db0dea80 Binary files /dev/null and b/gradio_cached_examples/19/component 0/ab669c2acaeb6f957c50/000001.mp4 differ diff --git a/gradio_cached_examples/19/component 0/ceca750cda163ac6f548/000000.mp4 b/gradio_cached_examples/19/component 0/ceca750cda163ac6f548/000000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..43816daee2d7d2e499eeef041baf5f7c93e06753 Binary files /dev/null and b/gradio_cached_examples/19/component 0/ceca750cda163ac6f548/000000.mp4 differ diff --git a/gradio_cached_examples/19/log.csv b/gradio_cached_examples/19/log.csv new file mode 100644 index 0000000000000000000000000000000000000000..4825d5fa1278b320854c12b1d6ce2d7098736761 --- /dev/null +++ b/gradio_cached_examples/19/log.csv @@ -0,0 +1,5 @@ +component 0,Seed,flag,username,timestamp +"{""video"":{""path"":""gradio_cached_examples/19/component 0/ceca750cda163ac6f548/000000.mp4"",""url"":""/file=/tmp/gradio/2879d2ca96d58c8931b302213e276f4955b51afa/000000.mp4"",""size"":null,""orig_name"":""000000.mp4"",""mime_type"":null,""is_stream"":false},""subtitles"":null}",42,,,2024-02-25 17:49:43.926703 +"{""video"":{""path"":""gradio_cached_examples/19/component 0/ab669c2acaeb6f957c50/000001.mp4"",""url"":""/file=/tmp/gradio/3fe5de118a0bc4b9e389758b5bfb2a9682e9ec4f/000001.mp4"",""size"":null,""orig_name"":""000001.mp4"",""mime_type"":null,""is_stream"":false},""subtitles"":null}",42,,,2024-02-25 17:50:17.506490 +"{""video"":{""path"":""gradio_cached_examples/19/component 0/a21547779ff20817de06/000002.mp4"",""url"":""/file=/tmp/gradio/7bb404b88df0715738e14aa3e0d5d6975d90ad87/000002.mp4"",""size"":null,""orig_name"":""000002.mp4"",""mime_type"":null,""is_stream"":false},""subtitles"":null}",42,,,2024-02-25 17:50:51.099873 +"{""video"":{""path"":""gradio_cached_examples/19/component 0/3725061b1c373489a048/000003.mp4"",""url"":""/file=/tmp/gradio/515c7849b4b9b1de0fdded51f77517bd15f92734/000003.mp4"",""size"":null,""orig_name"":""000003.mp4"",""mime_type"":null,""is_stream"":false},""subtitles"":null}",42,,,2024-02-25 17:51:24.329419 diff --git a/lcm_scheduler.py b/lcm_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..86fdf64394224fddc44990685ae8fc66da174356 --- /dev/null +++ b/lcm_scheduler.py @@ -0,0 +1,468 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.utils import BaseOutput, logging +from diffusers.utils.torch_utils import randn_tensor +from diffusers.schedulers.scheduling_utils import SchedulerMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class AnimateLCMSVDStochasticIterativeSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function. 
+ + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class AnimateLCMSVDStochasticIterativeScheduler(SchedulerMixin, ConfigMixin): + """ + Multistep and onestep sampling for consistency models. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 40): + The number of diffusion steps to train the model. + sigma_min (`float`, defaults to 0.002): + Minimum noise magnitude in the sigma schedule. Defaults to 0.002 from the original implementation. + sigma_max (`float`, defaults to 80.0): + Maximum noise magnitude in the sigma schedule. Defaults to 80.0 from the original implementation. + sigma_data (`float`, defaults to 0.5): + The standard deviation of the data distribution from the EDM + [paper](https://huggingface.co/papers/2206.00364). Defaults to 0.5 from the original implementation. + s_noise (`float`, defaults to 1.0): + The amount of additional noise to counteract loss of detail during sampling. A reasonable range is [1.000, + 1.011]. Defaults to 1.0 from the original implementation. + rho (`float`, defaults to 7.0): + The parameter for calculating the Karras sigma schedule from the EDM + [paper](https://huggingface.co/papers/2206.00364). Defaults to 7.0 from the original implementation. + clip_denoised (`bool`, defaults to `True`): + Whether to clip the denoised outputs to `(-1, 1)`. + timesteps (`List` or `np.ndarray` or `torch.Tensor`, *optional*): + An explicit timestep schedule that can be optionally specified. The timesteps are expected to be in + increasing order. + """ + + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 40, + sigma_min: float = 0.002, + sigma_max: float = 80.0, + sigma_data: float = 0.5, + s_noise: float = 1.0, + rho: float = 7.0, + clip_denoised: bool = True, + ): + # standard deviation of the initial noise distribution + self.init_noise_sigma = (sigma_max**2 + 1) ** 0.5 + # self.init_noise_sigma = sigma_max + + ramp = np.linspace(0, 1, num_train_timesteps) + sigmas = self._convert_to_karras(ramp) + sigmas = np.concatenate([sigmas, np.array([0])]) + timesteps = self.sigma_to_t(sigmas) + + # setable values + self.num_inference_steps = None + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps) + self.custom_timesteps = False + self.is_scale_input_called = False + self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + return indices.item() + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + def scale_model_input( + self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] + ) -> torch.FloatTensor: + """ + Scales the consistency model input by `(sigma**2 + sigma_data**2) ** 0.5`. + + Args: + sample (`torch.FloatTensor`): + The input sample. 
+ timestep (`float` or `torch.FloatTensor`): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + # Get sigma corresponding to timestep + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + sample = sample / ((sigma**2 + self.config.sigma_data**2) ** 0.5) + + self.is_scale_input_called = True + return sample + + # def _sigma_to_t(self, sigma, log_sigmas): + # # get log sigma + # log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # # get distribution + # dists = log_sigma - log_sigmas[:, np.newaxis] + + # # get sigmas range + # low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + # high_idx = low_idx + 1 + + # low = log_sigmas[low_idx] + # high = log_sigmas[high_idx] + + # # interpolate sigmas + # w = (low - log_sigma) / (low - high) + # w = np.clip(w, 0, 1) + + # # transform interpolation to time range + # t = (1 - w) * low_idx + w * high_idx + # t = t.reshape(sigma.shape) + # return t + + def sigma_to_t(self, sigmas: Union[float, np.ndarray]): + """ + Gets scaled timesteps from the Karras sigmas for input to the consistency model. + + Args: + sigmas (`float` or `np.ndarray`): + A single Karras sigma or an array of Karras sigmas. + + Returns: + `float` or `np.ndarray`: + A scaled input timestep or scaled input timestep array. + """ + if not isinstance(sigmas, np.ndarray): + sigmas = np.array(sigmas, dtype=np.float64) + + timesteps = 0.25 * np.log(sigmas + 1e-44) + + return timesteps + + def set_timesteps( + self, + num_inference_steps: Optional[int] = None, + device: Union[str, torch.device] = None, + timesteps: Optional[List[int]] = None, + ): + """ + Sets the timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed, + `num_inference_steps` must be `None`. + """ + if num_inference_steps is None and timesteps is None: + raise ValueError( + "Exactly one of `num_inference_steps` or `timesteps` must be supplied." + ) + + if num_inference_steps is not None and timesteps is not None: + raise ValueError( + "Can only pass one of `num_inference_steps` or `timesteps`." + ) + + # Follow DDPMScheduler custom timesteps logic + if timesteps is not None: + for i in range(1, len(timesteps)): + if timesteps[i] >= timesteps[i - 1]: + raise ValueError("`timesteps` must be in descending order.") + + if timesteps[0] >= self.config.num_train_timesteps: + raise ValueError( + f"`timesteps` must start before `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps}." + ) + + timesteps = np.array(timesteps, dtype=np.int64) + self.custom_timesteps = True + else: + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." 
+ ) + + self.num_inference_steps = num_inference_steps + + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + timesteps = ( + (np.arange(0, num_inference_steps) * step_ratio) + .round()[::-1] + .copy() + .astype(np.int64) + ) + self.custom_timesteps = False + + # Map timesteps to Karras sigmas directly for multistep sampling + # See https://github.com/openai/consistency_models/blob/main/cm/karras_diffusion.py#L675 + num_train_timesteps = self.config.num_train_timesteps + ramp = timesteps[::-1].copy() + ramp = ramp / (num_train_timesteps - 1) + sigmas = self._convert_to_karras(ramp) + timesteps = self.sigma_to_t(sigmas) + + sigmas = np.concatenate([sigmas, [0]]).astype(np.float32) + self.sigmas = torch.from_numpy(sigmas).to(device=device) + + if str(device).startswith("mps"): + # mps does not support float64 + self.timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32) + else: + self.timesteps = torch.from_numpy(timesteps).to(device=device) + + self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Modified _convert_to_karras implementation that takes in ramp as argument + def _convert_to_karras(self, ramp): + """Constructs the noise schedule of Karras et al. (2022).""" + + sigma_min: float = self.config.sigma_min + sigma_max: float = self.config.sigma_max + + rho = self.config.rho + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + def get_scalings(self, sigma): + sigma_data = self.config.sigma_data + + c_skip = sigma_data**2 / (sigma**2 + sigma_data**2) + c_out = -sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5 + return c_skip, c_out + + def get_scalings_for_boundary_condition(self, sigma): + """ + Gets the scalings used in the consistency model parameterization (from Appendix C of the + [paper](https://huggingface.co/papers/2303.01469)) to enforce boundary condition. + + + + `epsilon` in the equations for `c_skip` and `c_out` is set to `sigma_min`. + + + + Args: + sigma (`torch.FloatTensor`): + The current sigma in the Karras sigma schedule. + + Returns: + `tuple`: + A two-element tuple where `c_skip` (which weights the current sample) is the first element and `c_out` + (which weights the consistency model output) is the second element. + """ + sigma_min = self.config.sigma_min + sigma_data = self.config.sigma_data + + c_skip = sigma_data**2 / ((sigma) ** 2 + sigma_data**2) + c_out = -sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5 + return c_skip, c_out + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + + index_candidates = (self.timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + if len(index_candidates) > 1: + step_index = index_candidates[1] + else: + step_index = index_candidates[0] + + self._step_index = step_index.item() + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[AnimateLCMSVDStochasticIterativeSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + timestep (`float`): + The current timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a + [`~schedulers.scheduling_consistency_models.AnimateLCMSVDStochasticIterativeSchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_consistency_models.AnimateLCMSVDStochasticIterativeSchedulerOutput`] or `tuple`: + If return_dict is `True`, + [`~schedulers.scheduling_consistency_models.AnimateLCMSVDStochasticIterativeSchedulerOutput`] is returned, + otherwise a tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + f" `{self.__class__}.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if not self.is_scale_input_called: + logger.warning( + "The `scale_model_input` function should be called before `step` to ensure correct denoising. " + "See `StableDiffusionPipeline` for a usage example." + ) + + sigma_min = self.config.sigma_min + sigma_max = self.config.sigma_max + + if self.step_index is None: + self._init_step_index(timestep) + + # sigma_next corresponds to next_t in original implementation + sigma = self.sigmas[self.step_index] + if self.step_index + 1 < self.config.num_train_timesteps: + sigma_next = self.sigmas[self.step_index + 1] + else: + # Set sigma_next to sigma_min + sigma_next = self.sigmas[-1] + + # Get scalings for boundary conditions + + c_skip, c_out = self.get_scalings_for_boundary_condition(sigma) + + # 1. Denoise model output using boundary conditions + denoised = c_out * model_output + c_skip * sample + if self.config.clip_denoised: + denoised = denoised.clamp(-1, 1) + + # 2. Sample z ~ N(0, s_noise^2 * I) + # Noise is not used for onestep sampling. 
+ if len(self.timesteps) > 1: + noise = randn_tensor( + model_output.shape, + dtype=model_output.dtype, + device=model_output.device, + generator=generator, + ) + else: + noise = torch.zeros_like(model_output) + z = noise * self.config.s_noise + + sigma_hat = sigma_next.clamp(min=0, max=sigma_max) + + print("denoise currently") + print(sigma_hat) + + # origin + prev_sample = denoised + z * sigma_hat + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return AnimateLCMSVDStochasticIterativeSchedulerOutput(prev_sample=prev_sample) + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to( + device=original_samples.device, dtype=original_samples.dtype + ) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to( + original_samples.device, dtype=torch.float32 + ) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/outputs_gradio/000000.mp4 b/outputs_gradio/000000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..43816daee2d7d2e499eeef041baf5f7c93e06753 Binary files /dev/null and b/outputs_gradio/000000.mp4 differ diff --git a/outputs_gradio/000001.mp4 b/outputs_gradio/000001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..281fb7807b57615b4d975508820c6d14db0dea80 Binary files /dev/null and b/outputs_gradio/000001.mp4 differ diff --git a/outputs_gradio/000002.mp4 b/outputs_gradio/000002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fc23c00eff7d435f3a57aab056ef3f14ae411a63 Binary files /dev/null and b/outputs_gradio/000002.mp4 differ diff --git a/outputs_gradio/000003.mp4 b/outputs_gradio/000003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..f588d8bd00751003613521670313ae48493d42da Binary files /dev/null and b/outputs_gradio/000003.mp4 differ diff --git a/outputs_gradio/000004.mp4 b/outputs_gradio/000004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fbc46167002cc377797a0d5279ca73ec562fafdc Binary files /dev/null and b/outputs_gradio/000004.mp4 differ diff --git a/outputs_gradio/000005.mp4 b/outputs_gradio/000005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..468c8b3c56e66d27ed50e6603eabb7e610f3da14 Binary files /dev/null and b/outputs_gradio/000005.mp4 differ diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..f3c8237d3dbb3141a9b2014553ee909b21cd17d4 --- /dev/null +++ b/pipeline.py @@ -0,0 +1,711 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection + +from diffusers.image_processor import VaeImageProcessor +from diffusers.models import ( + AutoencoderKLTemporalDecoder, + UNetSpatioTemporalConditionModel, +) +from diffusers.schedulers import EulerDiscreteScheduler +from diffusers.utils import BaseOutput, logging +from diffusers.utils.torch_utils import is_compiled_module, randn_tensor +from diffusers.pipelines.pipeline_utils import DiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def _append_dims(x, target_dims): + """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" + dims_to_append = target_dims - x.ndim + if dims_to_append < 0: + raise ValueError( + f"input has {x.ndim} dims but target_dims is {target_dims}, which is less" + ) + return x[(...,) + (None,) * dims_to_append] + + +def tensor2vid(video: torch.Tensor, processor, output_type="np"): + # Based on: + # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 + + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + return outputs + + +@dataclass +class StableVideoDiffusionPipelineOutput(BaseOutput): + r""" + Output class for zero-shot text-to-video pipeline. + + Args: + frames (`[List[PIL.Image.Image]`, `np.ndarray`]): + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + """ + + frames: Union[List[PIL.Image.Image], np.ndarray] + + +class StableVideoDiffusionPipeline(DiffusionPipeline): + r""" + Pipeline to generate video from an input image using Stable Video Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + image_encoder ([`~transformers.CLIPVisionModelWithProjection`]): + Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)). + unet ([`UNetSpatioTemporalConditionModel`]): + A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents. + scheduler ([`EulerDiscreteScheduler`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. 
+ feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images. + """ + + model_cpu_offload_seq = "image_encoder->unet->vae" + _callback_tensor_inputs = ["latents"] + + def __init__( + self, + vae: AutoencoderKLTemporalDecoder, + image_encoder: CLIPVisionModelWithProjection, + unet: UNetSpatioTemporalConditionModel, + scheduler: EulerDiscreteScheduler, + feature_extractor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + vae=vae, + image_encoder=image_encoder, + unet=unet, + scheduler=scheduler, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + def _encode_image( + self, image, device, num_videos_per_prompt, do_classifier_free_guidance + ): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.image_processor.pil_to_numpy(image) + image = self.image_processor.numpy_to_pt(image) + + # We normalize the image before resizing to match with the original implementation. + # Then we unnormalize it after resizing. + image = image * 2.0 - 1.0 + image = _resize_with_antialiasing(image, (224, 224)) + image = (image + 1.0) / 2.0 + + # Normalize the image with for CLIP input + image = self.feature_extractor( + images=image, + do_normalize=True, + do_center_crop=False, + do_resize=False, + do_rescale=False, + return_tensors="pt", + ).pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeddings = self.image_encoder(image).image_embeds + image_embeddings = image_embeddings.unsqueeze(1) + + # duplicate image embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = image_embeddings.shape + image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1) + image_embeddings = image_embeddings.view( + bs_embed * num_videos_per_prompt, seq_len, -1 + ) + + if do_classifier_free_guidance: + negative_image_embeddings = torch.zeros_like(image_embeddings) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_embeddings = torch.cat([negative_image_embeddings, image_embeddings]) + + return image_embeddings + + def _encode_vae_image( + self, + image: torch.Tensor, + device, + num_videos_per_prompt, + do_classifier_free_guidance, + ): + image = image.to(device=device) + image_latents = self.vae.encode(image).latent_dist.mode() + + if do_classifier_free_guidance: + negative_image_latents = torch.zeros_like(image_latents) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_latents = torch.cat([negative_image_latents, image_latents]) + + # duplicate image_latents for each generation per prompt, using mps friendly method + image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1) + + return image_latents + + def _get_add_time_ids( + self, + fps, + motion_bucket_id, + noise_aug_strength, + dtype, + batch_size, + num_videos_per_prompt, + do_classifier_free_guidance, + ): + add_time_ids = [fps, motion_bucket_id, noise_aug_strength] + + passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len( + add_time_ids + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1) + + if do_classifier_free_guidance: + add_time_ids = torch.cat([add_time_ids, add_time_ids]) + + return add_time_ids + + def decode_latents(self, latents, num_frames, decode_chunk_size=14): + # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width] + latents = latents.flatten(0, 1) + + latents = 1 / self.vae.config.scaling_factor * latents + + forward_vae_fn = ( + self.vae._orig_mod.forward + if is_compiled_module(self.vae) + else self.vae.forward + ) + accepts_num_frames = "num_frames" in set( + inspect.signature(forward_vae_fn).parameters.keys() + ) + + # decode decode_chunk_size frames at a time to avoid OOM + frames = [] + for i in range(0, latents.shape[0], decode_chunk_size): + num_frames_in = latents[i : i + decode_chunk_size].shape[0] + decode_kwargs = {} + if accepts_num_frames: + # we only pass num_frames_in if it's expected + decode_kwargs["num_frames"] = num_frames_in + + frame = self.vae.decode( + latents[i : i + decode_chunk_size], **decode_kwargs + ).sample + frames.append(frame) + frames = torch.cat(frames, dim=0) + + # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width] + frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute( + 0, 2, 1, 3, 4 + ) + + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + frames = frames.float() + return frames + + def check_inputs(self, image, height, width): + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + f" {type(image)}" + ) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError( + f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
+ ) + + def prepare_latents( + self, + batch_size, + num_frames, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + shape = ( + batch_size, + num_frames, + num_channels_latents // 2, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor( + shape, generator=generator, device=device, dtype=dtype + ) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + if isinstance(self.guidance_scale, (int, float)): + return self.guidance_scale > 1 + return self.guidance_scale.max() > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + def __call__( + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], + height: int = 576, + width: int = 1024, + num_frames: Optional[int] = None, + num_inference_steps: int = 25, + min_guidance_scale: float = 1.0, + max_guidance_scale: float = 3.0, + fps: int = 7, + motion_bucket_id: int = 127, + noise_aug_strength: int = 0.02, + decode_chunk_size: Optional[int] = None, + num_videos_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + return_dict: bool = True, + ): + r""" + The call function to the pipeline for generation. + + Args: + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + Image or images to guide image generation. If you provide a tensor, it needs to be compatible with + [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_frames (`int`, *optional*): + The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt` + num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + min_guidance_scale (`float`, *optional*, defaults to 1.0): + The minimum guidance scale. Used for the classifier free guidance with first frame. 
+ max_guidance_scale (`float`, *optional*, defaults to 3.0): + The maximum guidance scale. Used for the classifier free guidance with last frame. + fps (`int`, *optional*, defaults to 7): + Frames per second. The rate at which the generated images shall be exported to a video after generation. + Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. + motion_bucket_id (`int`, *optional*, defaults to 127): + The motion bucket ID. Used as conditioning for the generation. The higher the number the more motion will be in the video. + noise_aug_strength (`int`, *optional*, defaults to 0.02): + The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion. + decode_chunk_size (`int`, *optional*): + The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency + between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once + for maximal quality. Reduce `decode_chunk_size` to reduce memory usage. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + + Returns: + [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list of list with the generated frames. 
+ + Examples: + + ```py + from diffusers import StableVideoDiffusionPipeline + from diffusers.utils import load_image, export_to_video + + pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") + pipe.to("cuda") + + image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200") + image = image.resize((1024, 576)) + + frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] + export_to_video(frames, "generated.mp4", fps=7) + ``` + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_frames = ( + num_frames if num_frames is not None else self.unet.config.num_frames + ) + decode_chunk_size = ( + decode_chunk_size if decode_chunk_size is not None else num_frames + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs(image, height, width) + + # 2. Define call parameters + if isinstance(image, PIL.Image.Image): + batch_size = 1 + elif isinstance(image, list): + batch_size = len(image) + else: + batch_size = image.shape[0] + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + self._guidance_scale = max_guidance_scale + + # 3. Encode input image + image_embeddings = self._encode_image( + image, device, num_videos_per_prompt, self.do_classifier_free_guidance + ) + + # NOTE: Stable Diffusion Video was conditioned on fps - 1, which + # is why it is reduced here. + # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188 + fps = fps - 1 + + # 4. Encode input image using VAE + image = self.image_processor.preprocess(image, height=height, width=width) + noise = randn_tensor( + image.shape, generator=generator, device=image.device, dtype=image.dtype + ) + image = image + noise_aug_strength * noise + + needs_upcasting = ( + self.vae.dtype == torch.float16 and self.vae.config.force_upcast + ) + if needs_upcasting: + self.vae.to(dtype=torch.float32) + + image_latents = self._encode_vae_image( + image, device, num_videos_per_prompt, self.do_classifier_free_guidance + ) + image_latents = image_latents.to(image_embeddings.dtype) + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + + # Repeat the image latents for each frame so we can concatenate them with the noise + # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width] + image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1) + + # 5. Get Added Time IDs + added_time_ids = self._get_add_time_ids( + fps, + motion_bucket_id, + noise_aug_strength, + image_embeddings.dtype, + batch_size, + num_videos_per_prompt, + self.do_classifier_free_guidance, + ) + added_time_ids = added_time_ids.to(device) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + print("converted after karras", self.scheduler.sigmas) + timesteps = self.scheduler.timesteps + + # 5. 
Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + num_frames, + num_channels_latents, + height, + width, + image_embeddings.dtype, + device, + generator, + latents, + ) + + # 7. Prepare guidance scale + guidance_scale = torch.linspace( + min_guidance_scale, max_guidance_scale, num_frames + ).unsqueeze(0) + guidance_scale = guidance_scale.to(device, latents.dtype) + guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1) + guidance_scale = _append_dims(guidance_scale, latents.ndim) + + self._guidance_scale = guidance_scale + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([latents] * 2) + if self.do_classifier_free_guidance + else latents + ) + latent_model_input = self.scheduler.scale_model_input( + latent_model_input, t + ) + + # Concatenate image_latents over channels dimention + latent_model_input = torch.cat( + [latent_model_input, image_latents], dim=2 + ) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=image_embeddings, + added_time_ids=added_time_ids, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * ( + noise_pred_cond - noise_pred_uncond + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + progress_bar.update() + + if not output_type == "latent": + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + frames = self.decode_latents(latents, num_frames, decode_chunk_size) + frames = tensor2vid(frames, self.image_processor, output_type=output_type) + else: + frames = latents + + self.maybe_free_model_hooks() + + if not return_dict: + return frames + + return StableVideoDiffusionPipelineOutput(frames=frames) + + +# resizing utils +# TODO: clean up later +def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True): + h, w = input.shape[-2:] + factors = (h / size[0], w / size[1]) + + # First, we have to determine sigma + # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171 + sigmas = ( + max((factors[0] - 1.0) / 2.0, 0.001), + max((factors[1] - 1.0) / 2.0, 0.001), + ) + + # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma + # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206 + # But they do it in the 2 passes, which gives better results. 
Let's try 2 sigmas for now + ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3)) + + # Make sure it is odd + if (ks[0] % 2) == 0: + ks = ks[0] + 1, ks[1] + + if (ks[1] % 2) == 0: + ks = ks[0], ks[1] + 1 + + input = _gaussian_blur2d(input, ks, sigmas) + + output = torch.nn.functional.interpolate( + input, size=size, mode=interpolation, align_corners=align_corners + ) + return output + + +def _compute_padding(kernel_size): + """Compute padding tuple.""" + # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom) + # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad + if len(kernel_size) < 2: + raise AssertionError(kernel_size) + computed = [k - 1 for k in kernel_size] + + # for even kernels we need to do asymmetric padding :( + out_padding = 2 * len(kernel_size) * [0] + + for i in range(len(kernel_size)): + computed_tmp = computed[-(i + 1)] + + pad_front = computed_tmp // 2 + pad_rear = computed_tmp - pad_front + + out_padding[2 * i + 0] = pad_front + out_padding[2 * i + 1] = pad_rear + + return out_padding + + +def _filter2d(input, kernel): + # prepare kernel + b, c, h, w = input.shape + tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype) + + tmp_kernel = tmp_kernel.expand(-1, c, -1, -1) + + height, width = tmp_kernel.shape[-2:] + + padding_shape: list[int] = _compute_padding([height, width]) + input = torch.nn.functional.pad(input, padding_shape, mode="reflect") + + # kernel and input tensor reshape to align element-wise or batch-wise params + tmp_kernel = tmp_kernel.reshape(-1, 1, height, width) + input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1)) + + # convolve the tensor with the kernel. + output = torch.nn.functional.conv2d( + input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1 + ) + + out = output.view(b, c, h, w) + return out + + +def _gaussian(window_size: int, sigma): + if isinstance(sigma, float): + sigma = torch.tensor([[sigma]]) + + batch_size = sigma.shape[0] + + x = ( + torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) + - window_size // 2 + ).expand(batch_size, -1) + + if window_size % 2 == 0: + x = x + 0.5 + + gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0))) + + return gauss / gauss.sum(-1, keepdim=True) + + +def _gaussian_blur2d(input, kernel_size, sigma): + if isinstance(sigma, tuple): + sigma = torch.tensor([sigma], dtype=input.dtype) + else: + sigma = sigma.to(dtype=input.dtype) + + ky, kx = int(kernel_size[0]), int(kernel_size[1]) + bs = sigma.shape[0] + kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1)) + kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1)) + out_x = _filter2d(input, kernel_x[..., None, :]) + out = _filter2d(out_x, kernel_y[..., None]) + + return out diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d2fca48ce0b92c502a953e7f6e2e30f95ab3d49 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +diffusers==0.25.1 +gradio==4.19.2 +Pillow==10.2.0 +torch==2.2.0 diff --git a/safetensors/AnimateLCM-SVD-xt-1.1.safetensors b/safetensors/AnimateLCM-SVD-xt-1.1.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eab03ff6b2ce04f2445c7a0ad9d03de71c3b2b78 --- /dev/null +++ b/safetensors/AnimateLCM-SVD-xt-1.1.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94b2da1fcca8d03458ef0b07b94b4c55f94117cb0d90265c6c8452239ecc166e +size 6098682464 diff --git a/safetensors/AnimateLCM-SVD-xt.safetensors 
b/safetensors/AnimateLCM-SVD-xt.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fe2b1a02f682f4c9361c6dd6bb0591071f3a7ddd --- /dev/null +++ b/safetensors/AnimateLCM-SVD-xt.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ca55e35f29437e8a65e8a1a9ce75262d5bab3d4fe137bdc3f3a94512c54b377 +size 6098682464 diff --git a/test_imgs/ai-generated-8255456_1280.png b/test_imgs/ai-generated-8255456_1280.png new file mode 100644 index 0000000000000000000000000000000000000000..7d09d83714342b6755cf4865d21835e43252b848 --- /dev/null +++ b/test_imgs/ai-generated-8255456_1280.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e36d82e13141d8fc8ead89d57f209a9d27d5768cd1b92323b0bf51e465a7968 +size 1435555 diff --git a/test_imgs/ai-generated-8411866_1280.jpg b/test_imgs/ai-generated-8411866_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e3499e57a0b97778617945a9df6042d228e4f645 Binary files /dev/null and b/test_imgs/ai-generated-8411866_1280.jpg differ diff --git a/test_imgs/ai-generated-8463496_1280.jpg b/test_imgs/ai-generated-8463496_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ad33e367f02a5d7de3c70dd7d991d9b0e9fbd307 Binary files /dev/null and b/test_imgs/ai-generated-8463496_1280.jpg differ diff --git a/test_imgs/ai-generated-8476858_1280.png b/test_imgs/ai-generated-8476858_1280.png new file mode 100644 index 0000000000000000000000000000000000000000..c577ab9e6fc279389d42967b6b01121ab6b5451b Binary files /dev/null and b/test_imgs/ai-generated-8476858_1280.png differ diff --git a/test_imgs/ai-generated-8479572_1280.jpg b/test_imgs/ai-generated-8479572_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d2a17c871360673813d5a9232d2388f2916f20cc Binary files /dev/null and b/test_imgs/ai-generated-8479572_1280.jpg differ diff --git a/test_imgs/ai-generated-8481641_1280.jpg b/test_imgs/ai-generated-8481641_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5f79dcab2e48a7d245c8ca747c5ae511e3a224af Binary files /dev/null and b/test_imgs/ai-generated-8481641_1280.jpg differ diff --git a/test_imgs/ai-generated-8489879_1280.png b/test_imgs/ai-generated-8489879_1280.png new file mode 100644 index 0000000000000000000000000000000000000000..9e49c142fb20eeb13d20a1ffa1e32607b7e1c55c --- /dev/null +++ b/test_imgs/ai-generated-8489879_1280.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a0ecd8306216a74ae6e19b63753284ee4c58f8dba4c09d8b3091c1f8353745c +size 1532997 diff --git a/test_imgs/ai-generated-8496135_1280.jpg b/test_imgs/ai-generated-8496135_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cb83d64b8c99f4e95013593d48e14734c0a21bba Binary files /dev/null and b/test_imgs/ai-generated-8496135_1280.jpg differ diff --git a/test_imgs/ai-generated-8496952_1280.jpg b/test_imgs/ai-generated-8496952_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dbc90593d20100a064fc345d2721bc4713983f60 Binary files /dev/null and b/test_imgs/ai-generated-8496952_1280.jpg differ diff --git a/test_imgs/ai-generated-8498844_1280.jpg b/test_imgs/ai-generated-8498844_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..362a8c85aedc668d930ac6e3f5a6ec27b8772445 Binary files /dev/null and b/test_imgs/ai-generated-8498844_1280.jpg differ diff --git a/test_imgs/bird-7411270_1280.jpg b/test_imgs/bird-7411270_1280.jpg new file mode 100644 
index 0000000000000000000000000000000000000000..9c1a0b21ffaabbcfe68f1d6e89bb5e6a3c64cee2 Binary files /dev/null and b/test_imgs/bird-7411270_1280.jpg differ diff --git a/test_imgs/bird-7586857_1280.jpg b/test_imgs/bird-7586857_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d2ba1e28fb89d2015d881ef5e8b1b6e3de3bf274 Binary files /dev/null and b/test_imgs/bird-7586857_1280.jpg differ diff --git a/test_imgs/bird-8014191_1280.jpg b/test_imgs/bird-8014191_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e6fb79fb14d849a446fdfe6d4eecc46fc08a46d3 Binary files /dev/null and b/test_imgs/bird-8014191_1280.jpg differ diff --git a/test_imgs/couple-8019370_1280.jpg b/test_imgs/couple-8019370_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d04de857d7bfcf3242d836ee93d656cc93215d7f Binary files /dev/null and b/test_imgs/couple-8019370_1280.jpg differ diff --git a/test_imgs/cupcakes-380178_1280.jpg b/test_imgs/cupcakes-380178_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..50d7f5e7b05d07e47335a3888d133e6c2c7f2bbf Binary files /dev/null and b/test_imgs/cupcakes-380178_1280.jpg differ diff --git a/test_imgs/dog-7330712_1280.jpg b/test_imgs/dog-7330712_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8f25b3e196a57ba7ff84eb1c36bd7670b27d3df9 Binary files /dev/null and b/test_imgs/dog-7330712_1280.jpg differ diff --git a/test_imgs/dog-7396912_1280.jpg b/test_imgs/dog-7396912_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4a5d7de90bca969d5b199de55b6ed7c6f745b875 Binary files /dev/null and b/test_imgs/dog-7396912_1280.jpg differ diff --git a/test_imgs/girl-4898696_1280.jpg b/test_imgs/girl-4898696_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5ddf6e7ec85506c35203a76a92b7fb5e4796a38e Binary files /dev/null and b/test_imgs/girl-4898696_1280.jpg differ diff --git a/test_imgs/grey-capped-flycatcher-8071233_1280.jpg b/test_imgs/grey-capped-flycatcher-8071233_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..23e775d74fe96692c4944043af4979cabf025434 Binary files /dev/null and b/test_imgs/grey-capped-flycatcher-8071233_1280.jpg differ diff --git a/test_imgs/halloween-4585684_1280.jpg b/test_imgs/halloween-4585684_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..922f804b96bcc0043f7cc59c3bca164afa266764 Binary files /dev/null and b/test_imgs/halloween-4585684_1280.jpg differ diff --git a/test_imgs/leaf-7260246_1280.jpg b/test_imgs/leaf-7260246_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f8a38c64085f3a4a291cca736e8c7dadda06897c Binary files /dev/null and b/test_imgs/leaf-7260246_1280.jpg differ diff --git a/test_imgs/meerkat-7465819_1280.jpg b/test_imgs/meerkat-7465819_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..04924224b5344414696dd054675979c0a5673851 Binary files /dev/null and b/test_imgs/meerkat-7465819_1280.jpg differ diff --git a/test_imgs/mobile-phone-1875813_1280.jpg b/test_imgs/mobile-phone-1875813_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5e6c04b9e32159526dea321153e21d60122cb4f2 Binary files /dev/null and b/test_imgs/mobile-phone-1875813_1280.jpg differ diff --git a/test_imgs/mother-8097324_1280.jpg b/test_imgs/mother-8097324_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2488fe9c44b1eefb9c6754ca3c9a0b4622ebf9fd Binary 
files /dev/null and b/test_imgs/mother-8097324_1280.jpg differ diff --git a/test_imgs/plane-8145957_1280.jpg b/test_imgs/plane-8145957_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d449dffa0b4de0ad2292c2d6c49e14ca12a2773b Binary files /dev/null and b/test_imgs/plane-8145957_1280.jpg differ diff --git a/test_imgs/power-station-6579092_1280.jpg b/test_imgs/power-station-6579092_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1353727f78eaeb1fda7c677fb411566aa24a161d Binary files /dev/null and b/test_imgs/power-station-6579092_1280.jpg differ diff --git a/test_imgs/ship-7833921_1280.jpg b/test_imgs/ship-7833921_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7cede4cfe78e38a54e162063ec6ddc448091859d Binary files /dev/null and b/test_imgs/ship-7833921_1280.jpg differ diff --git a/test_imgs/sleep-7871915_1280.jpg b/test_imgs/sleep-7871915_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6dc671d5f2b7c657f920accbd037e7628d674f6c Binary files /dev/null and b/test_imgs/sleep-7871915_1280.jpg differ diff --git a/test_imgs/squirrel-7985502_1280.jpg b/test_imgs/squirrel-7985502_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dc2827451968dbe27fc5ca8a77e056874b550184 Binary files /dev/null and b/test_imgs/squirrel-7985502_1280.jpg differ diff --git a/test_imgs/squirrel-8211238_1280.jpg b/test_imgs/squirrel-8211238_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b067c96b7f38dd53f538e519d5e6b9472793ae95 Binary files /dev/null and b/test_imgs/squirrel-8211238_1280.jpg differ diff --git a/test_imgs/training-8122941_1280.jpg b/test_imgs/training-8122941_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5d072b1ef97e43ff56b268325062bbef22ea513f Binary files /dev/null and b/test_imgs/training-8122941_1280.jpg differ diff --git a/test_imgs/violin-8405558_1280.jpg b/test_imgs/violin-8405558_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fe9d2b007a2ec4836f6f587603f4883ba9dbd6d8 Binary files /dev/null and b/test_imgs/violin-8405558_1280.jpg differ diff --git a/test_imgs/weight-8246973_1280.jpg b/test_imgs/weight-8246973_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ea70560d625225e8ccc769717dff3a30c23872f2 Binary files /dev/null and b/test_imgs/weight-8246973_1280.jpg differ diff --git a/test_imgs/woman-4549327_1280.jpg b/test_imgs/woman-4549327_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..67fb4c96444d5cf841c1df2624f23edfadbcea8a Binary files /dev/null and b/test_imgs/woman-4549327_1280.jpg differ diff --git a/test_imgs/woman-4757707_1280.jpg b/test_imgs/woman-4757707_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..37a9ea8dfcc70a8f1404a6c723de41046a703a4f Binary files /dev/null and b/test_imgs/woman-4757707_1280.jpg differ diff --git a/test_imgs/woman-5667299_1280.jpg b/test_imgs/woman-5667299_1280.jpg new file mode 100644 index 0000000000000000000000000000000000000000..553146a82986b26a89fec340f87a7f4475ed9668 Binary files /dev/null and b/test_imgs/woman-5667299_1280.jpg differ
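
A minimal sketch (not part of the diff above) of how the per-frame guidance in the denoising loop works: guidance_scale is a linspace from min_guidance_scale to max_guidance_scale over the frame axis, right-padded with singleton dims to the latents' rank, so later frames receive stronger classifier-free guidance. The tensor shapes, the guidance values, and the local _append_dims helper below are illustrative stand-ins.

import torch

def _append_dims(x, target_dims):
    # Right-pad `x` with singleton dimensions until it has `target_dims` dims,
    # mirroring the role of the helper used by the pipeline above.
    return x[(...,) + (None,) * (target_dims - x.ndim)]

# Illustrative sizes only: (batch, frames, channels, height, width) latents.
batch_size, num_frames = 1, 25
latents = torch.randn(batch_size, num_frames, 4, 72, 128)

min_guidance_scale, max_guidance_scale = 1.0, 3.0  # placeholder values
guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
guidance_scale = guidance_scale.repeat(batch_size, 1)        # (B, F)
guidance_scale = _append_dims(guidance_scale, latents.ndim)  # (B, F, 1, 1, 1)

# Classifier-free guidance: blend unconditional and conditional noise
# predictions with a per-frame weight that broadcasts over C, H, W.
noise_pred_uncond = torch.randn_like(latents)
noise_pred_cond = torch.randn_like(latents)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
print(noise_pred.shape)  # torch.Size([1, 25, 4, 72, 128])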
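
The sigma and kernel-size choices inside _resize_with_antialiasing can be checked in isolation. The small helper below is hypothetical (not in the repo) and only reproduces that arithmetic: sigma grows with the downscale factor, the kernel spans roughly two sigmas per side with a floor of 3 taps, and even sizes are bumped to odd so the blur stays centered.

def antialias_kernel(in_hw, out_hw):
    # Reproduces the sigma / kernel-size arithmetic of _resize_with_antialiasing
    # for a (height, width) -> (height, width) resize.
    factors = (in_hw[0] / out_hw[0], in_hw[1] / out_hw[1])
    sigmas = (
        max((factors[0] - 1.0) / 2.0, 0.001),
        max((factors[1] - 1.0) / 2.0, 0.001),
    )
    ks = [int(max(2.0 * 2 * s, 3)) for s in sigmas]
    ks = [k + 1 if k % 2 == 0 else k for k in ks]  # force odd kernel sizes
    return tuple(ks), sigmas

print(antialias_kernel((720, 1280), (576, 1024)))   # ((3, 3), (0.125, 0.125))
print(antialias_kernel((2160, 3840), (576, 1024)))  # ((5, 5), (1.375, 1.375))

For upscaling, the factors fall below 1, so the sigma floor of 0.001 and the minimum 3-tap kernel apply and the blur is effectively a no-op.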
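
_gaussian_blur2d above applies the blur as two 1-D passes (a 1xK kernel along width, then a Kx1 kernel along height), which is equivalent to a single 2-D convolution with the KxK outer-product kernel but cheaper. Below is a self-contained sanity check of that equivalence, using plain torch calls and zero padding instead of the reflect padding in the diff:

import torch
import torch.nn.functional as F

def gaussian_1d(k, sigma):
    # Normalized 1-D Gaussian; a simplified, unbatched version of the
    # _gaussian helper above.
    x = torch.arange(k, dtype=torch.float32) - k // 2
    g = torch.exp(-x.pow(2) / (2 * sigma ** 2))
    return g / g.sum()

k, sigma = 5, 1.2
g = gaussian_1d(k, sigma)
img = torch.randn(1, 1, 32, 32)

# Two separable 1-D passes: width first, then height (zero padding for brevity).
sep = F.conv2d(img, g.view(1, 1, 1, k), padding=(0, k // 2))
sep = F.conv2d(sep, g.view(1, 1, k, 1), padding=(k // 2, 0))

# One 2-D pass with the outer-product kernel.
full = F.conv2d(img, torch.outer(g, g).view(1, 1, k, k), padding=k // 2)

print(torch.allclose(sep, full, atol=1e-5))  # True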