Upload 2 files
- app.py +175 -0
- sd_models.py +239 -0
app.py
ADDED
@@ -0,0 +1,175 @@
import gradio as gr

from rf_models import RF_model
from sd_models import SD_model

import torch
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import torch.nn.functional as F

from diffusers import StableDiffusionXLImg2ImgPipeline
import time
import copy
import numpy as np

# SDXL refiner pipeline used by the two "Refine ..." buttons below.
pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
)
pipe = pipe.to("cuda")

# Shared state populated by the loader functions and read by the Gradio callbacks.
global model
global base_model
global img

def set_model(model_id):
    global model
    if model_id == "InstaFlow-0.9B":
        model = RF_model("./instaflow_09b.pt")
    elif model_id == "InstaFlow-1.7B":
        model = RF_model("./instaflow_17b.pt")
    else:
        raise NotImplementedError
    print('Finished Loading Model!')

def set_base_model(model_id):
    global base_model
    if model_id == "runwayml/stable-diffusion-v1-5":
        base_model = SD_model("runwayml/stable-diffusion-v1-5")
    else:
        raise NotImplementedError
    print('Finished Loading Base Model!')

def set_new_latent_and_generate_new_image(seed, prompt, negative_prompt="", num_inference_steps=1, guidance_scale=0.0):
    print('Generate with input seed')
    global model
    global img
    seed = int(seed)
    num_inference_steps = int(num_inference_steps)
    guidance_scale = float(guidance_scale)
    print(seed, num_inference_steps, guidance_scale)

    t_s = time.time()
    new_image = model.set_new_latent_and_generate_new_image(int(seed), prompt, negative_prompt, int(num_inference_steps), guidance_scale)
    inf_time = time.time() - t_s

    img = copy.copy(new_image[0])

    return new_image[0], inf_time

def set_new_latent_and_generate_new_image_with_base_model(seed, prompt, num_inference_steps=1, guidance_scale=0.0):
    print('Generate with input seed')
    global base_model
    global img
    negative_prompt = ""
    seed = int(seed)
    num_inference_steps = int(num_inference_steps)
    guidance_scale = float(guidance_scale)
    print(seed, num_inference_steps, guidance_scale)

    t_s = time.time()
    new_image = base_model.set_new_latent_and_generate_new_image(int(seed), prompt, negative_prompt, int(num_inference_steps), guidance_scale)
    inf_time = time.time() - t_s

    img = copy.copy(new_image[0])

    return new_image[0], inf_time


def set_new_latent_and_generate_new_image_and_random_seed(seed, prompt, negative_prompt="", num_inference_steps=1, guidance_scale=0.0):
    print('Generate with a random seed')
    global model
    global img
    seed = np.random.randint(0, 2**32)
    num_inference_steps = int(num_inference_steps)
    guidance_scale = float(guidance_scale)
    print(seed, num_inference_steps, guidance_scale)

    t_s = time.time()
    new_image = model.set_new_latent_and_generate_new_image(int(seed), prompt, negative_prompt, int(num_inference_steps), guidance_scale)
    inf_time = time.time() - t_s

    img = copy.copy(new_image[0])

    return new_image[0], seed, inf_time


def refine_image_512(prompt):
    print('Refine with SDXL-Refiner (512)')
    global img

    t_s = time.time()
    # Round-trip through a CHW tensor and back to an HWC float array (kept symmetric with the 1024 path below).
    img = torch.tensor(img).unsqueeze(0).permute(0, 3, 1, 2)
    img = img.permute(0, 2, 3, 1).squeeze(0).cpu().numpy()
    new_image = pipe(prompt, image=img).images[0]
    print('time consumption:', time.time() - t_s)
    new_image = np.array(new_image) * 1.0 / 255.

    img = new_image

    return new_image

def refine_image_1024(prompt):
    print('Refine with SDXL-Refiner (1024)')
    global img

    t_s = time.time()
    # Upsample the cached generation to 1024x1024 before refining.
    img = torch.tensor(img).unsqueeze(0).permute(0, 3, 1, 2)
    img = torch.nn.functional.interpolate(img, size=1024, mode='bilinear')
    img = img.permute(0, 2, 3, 1).squeeze(0).cpu().numpy()
    new_image = pipe(prompt, image=img).images[0]
    print('time consumption:', time.time() - t_s)
    new_image = np.array(new_image) * 1.0 / 255.

    img = new_image

    return new_image

# Load the one-step model and the multi-step baseline once at startup.
set_model('InstaFlow-0.9B')
set_base_model("runwayml/stable-diffusion-v1-5")

with gr.Blocks() as gradio_gui:
    gr.Markdown("Set Input Seed and Text Prompts Here")
    with gr.Row():
        with gr.Column(scale=0.4):
            seed_input = gr.Textbox(value='101098274', label="Random Seed")
        with gr.Column(scale=0.4):
            prompt_input = gr.Textbox(value='A high-resolution photograph of a waterfall in autumn; muted tone', label="Prompt")

    with gr.Row():
        with gr.Column(scale=0.4):
            with gr.Group():
                gr.Markdown("Generation from InstaFlow-0.9B")
                im = gr.Image()

                gr.Markdown("Model ID: One-Step InstaFlow-0.9B")
                inference_time_output = gr.Textbox(value='0.0', label='Inference Time with One-Step Model (Second)')
                new_image_button = gr.Button(value="One-Step Generation with the Input Seed")
                new_image_button.click(set_new_latent_and_generate_new_image, inputs=[seed_input, prompt_input], outputs=[im, inference_time_output])

                next_image_button = gr.Button(value="One-Step Generation with a New Random Seed")
                next_image_button.click(set_new_latent_and_generate_new_image_and_random_seed, inputs=[seed_input, prompt_input], outputs=[im, seed_input, inference_time_output])

                refine_button_512 = gr.Button(value="Refine One-Step Generation with SDXL Refiner (Resolution: 512)")
                refine_button_512.click(refine_image_512, inputs=[prompt_input], outputs=[im])

                refine_button_1024 = gr.Button(value="Refine One-Step Generation with SDXL Refiner (Resolution: 1024)")
                refine_button_1024.click(refine_image_1024, inputs=[prompt_input], outputs=[im])

        with gr.Column(scale=0.4):
            with gr.Group():
                gr.Markdown("Generation from Stable Diffusion 1.5")
                im_base = gr.Image()

                gr.Markdown("Model ID: Multi-Step Stable Diffusion 1.5")
                base_model_inference_time_output = gr.Textbox(value='0.0', label='Inference Time with Multi-Step Stable Diffusion (Second)')

                num_inference_steps = gr.Textbox(value='25', label="Number of Inference Steps for Stable Diffusion")
                guidance_scale = gr.Textbox(value='5.0', label="Guidance Scale for Stable Diffusion")

                base_new_image_button = gr.Button(value="Multi-Step Generation with Stable Diffusion and the Input Seed")
                base_new_image_button.click(set_new_latent_and_generate_new_image_with_base_model, inputs=[seed_input, prompt_input, num_inference_steps, guidance_scale], outputs=[im_base, base_model_inference_time_output])

gradio_gui.launch()
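For reference, a minimal sketch of how the callbacks above can be exercised without the UI, e.g. in the same interpreter session after the module-level set_model/set_base_model calls have run. It assumes the rf_models module and the ./instaflow_09b.pt checkpoint referenced by set_model are available on the Space (neither is part of this two-file upload), plus a CUDA device:

# Hypothetical smoke test for the callbacks defined above.
image, inf_time = set_new_latent_and_generate_new_image(
    101098274,
    'A high-resolution photograph of a waterfall in autumn; muted tone',
)
print('one-step generation took %.3f s' % inf_time)

# refine_image_1024 reads the cached global `img` from the last generation
# and refines it with the SDXL refiner pipeline loaded at startup.
refined = refine_image_1024('A high-resolution photograph of a waterfall in autumn; muted tone')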
sd_models.py
ADDED
@@ -0,0 +1,239 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import math
import os
import random
from pathlib import Path
from typing import Optional, Union, List, Callable

import datasets
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
import transformers
from datasets import load_dataset
from huggingface_hub import HfFolder, Repository, create_repo, whoami
from packaging import version
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer

import diffusers
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
from diffusers.optimization import get_scheduler
from diffusers.training_utils import EMAModel
from diffusers.utils import check_min_version, deprecate
from diffusers.utils.import_utils import is_xformers_available

import time

from torch.distributions import Normal, Categorical
from torch.distributions.multivariate_normal import MultivariateNormal
from torch.distributions.mixture_same_family import MixtureSameFamily

from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import torchvision

import cv2


def inference_latent(
    pipeline,
    prompt: Union[str, List[str]],
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: Optional[int] = 1,
):
    # 0. Default height and width to unet
    height = height or pipeline.unet.config.sample_size * pipeline.vae_scale_factor
    width = width or pipeline.unet.config.sample_size * pipeline.vae_scale_factor

    # 1. Check inputs. Raise error if not correct
    # pipeline.check_inputs(prompt, height, width, callback_steps)

    # 2. Define call parameters
    batch_size = 1 if isinstance(prompt, str) else len(prompt)
    device = pipeline._execution_device
    # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_embeddings = pipeline._encode_prompt(
        prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
    )

    # 4. Prepare timesteps
    pipeline.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = pipeline.scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_latents = pipeline.unet.in_channels
    latents = latents.reshape(1, num_channels_latents, 64, 64)

    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = pipeline.prepare_extra_step_kwargs(generator, eta)

    # 7. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * pipeline.scheduler.order

    latents_cllt = [latents.detach().clone()]
    with torch.no_grad():
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = pipeline.scheduler.scale_model_input(latent_model_input, t)

            noise_pred = pipeline.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            outputs = pipeline.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)

            latents = outputs.prev_sample

    example = {
        'latent': latents.detach().clone(),
        'text_embeddings': text_embeddings.chunk(2)[1].detach() if do_classifier_free_guidance else text_embeddings.detach(),
    }
    return example


def setup_seed(seed):
    import random
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.cuda.empty_cache()


class SD_model():
    # Thin wrapper around StableDiffusionPipeline; used by app.py as the multi-step baseline.

    def __init__(self, pretrained_model_name_or_path):
        self.pretrained_model_name_or_path = pretrained_model_name_or_path

        # Load scheduler, tokenizer and models.
        noise_scheduler = DDPMScheduler.from_pretrained(self.pretrained_model_name_or_path, subfolder="scheduler")
        tokenizer = CLIPTokenizer.from_pretrained(
            self.pretrained_model_name_or_path, subfolder="tokenizer"
        )
        text_encoder = CLIPTextModel.from_pretrained(
            self.pretrained_model_name_or_path, subfolder="text_encoder"
        )
        vae = AutoencoderKL.from_pretrained(
            self.pretrained_model_name_or_path, subfolder="vae"
        )
        unet = UNet2DConditionModel.from_pretrained(
            self.pretrained_model_name_or_path, subfolder="unet"
        )

        unet.eval()
        vae.eval()
        text_encoder.eval()

        # Freeze vae, text_encoder and unet
        vae.requires_grad_(False)
        text_encoder.requires_grad_(False)
        unet.requires_grad_(False)

        # These modules are only used for inference, so half precision is sufficient.
        weight_dtype = torch.float16
        self.weight_dtype = weight_dtype
        device = 'cuda'
        self.device = device

        # Move text_encoder, vae and unet to GPU and cast to weight_dtype
        text_encoder.to(device, dtype=weight_dtype)
        vae.to(device, dtype=weight_dtype)
        unet.to(device, dtype=weight_dtype)

        # Create the pipeline from the loaded modules.
        pipeline = StableDiffusionPipeline.from_pretrained(
            self.pretrained_model_name_or_path,
            text_encoder=text_encoder,
            vae=vae,
            unet=unet,
            torch_dtype=weight_dtype,
        )
        pipeline = pipeline.to(device)

        from diffusers import DPMSolverMultistepScheduler
        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
        self.pipeline = pipeline

    def set_new_latent_and_generate_new_image(self, seed=None, prompt=None, negative_prompt="", num_inference_steps=25, guidance_scale=5.0):
        if seed is None:
            raise ValueError("Must have a pre-defined random seed")

        if prompt is None:
            raise ValueError("Must have a user-specified text prompt")

        setup_seed(seed)
        self.latents = torch.randn((1, 4*64*64), device=self.device).to(dtype=self.weight_dtype)
        self.prompt = prompt
        self.negative_prompt = negative_prompt
        self.guidance_scale = guidance_scale
        self.num_inference_steps = num_inference_steps

        prompts = [prompt]
        negative_prompts = [negative_prompt]

        output = inference_latent(
            self.pipeline,
            prompt=prompts,
            negative_prompt=negative_prompts,
            num_inference_steps=num_inference_steps,
            guidance_scale=self.guidance_scale,
            latents=self.latents.detach().clone(),
        )

        image = self.pipeline.decode_latents(output['latent'])

        self.org_image = image

        return image