File size: 2,340 Bytes
deff113
 
 
 
 
 
 
 
78aed89
c483e0a
 
78aed89
 
 
deff113
 
 
 
 
 
 
 
78aed89
 
 
 
deff113
 
 
 
 
 
 
 
 
 
78aed89
 
 
deff113
 
5121d42
 
 
 
 
 
 
deff113
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import gradio as gr
from gradio.components import *
import torch
from torch import autocast
from torchvision import transforms as T

from boomerang import *


# Set to True when running in a Colab notebook.
COLAB = False
# Half precision simply mirrors the COLAB flag. When True, `main` casts the
# input image tensor and the noisy latents to float16 via `.half()`.
HALF_PRECISION = COLAB



def main(image, prompt, percent_noise):
  """Noise an image in latent space and run the diffusion pipeline on it.

  The image is encoded with ``pipe.vae``, noised with the scheduler's forward
  diffusion process up to a timestep chosen by ``percent_noise``, and the
  resulting latents are passed to ``pipe`` together with ``prompt``.

  Args:
    image: PIL image to start from (it is fed through ``T.PILToTensor``).
    prompt: Text prompt forwarded to ``pipe``.
    percent_noise: Fraction of ``pipe.scheduler.config.num_train_timesteps``
      used as the noising timestep, e.g. 0.5, 0.02 or 0.999.

  Returns:
    The first entry of ``pipe(...).images``.

  Raises:
    ValueError: If ``image`` is None (nothing was uploaded).
  """
  if image is None:
    # Fail with a clear message instead of an opaque error inside torchvision.
    raise ValueError("An input image is required.")

  # Convert image to float and preprocess it.
  # From huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py
  transform = T.Compose([T.PILToTensor(),
                        T.ConvertImageDtype(torch.float),
                        T.Normalize([0.5], [0.5])])

  # This is inference only: disabling autograd keeps the VAE encode (and
  # anything downstream) from building a graph and holding extra memory.
  with torch.no_grad():
    tensor = transform(image)
    if HALF_PRECISION:
      tensor = tensor.half()
    # Move to the pipeline's device and add the leading batch dimension.
    tensor = torch.unsqueeze(tensor.to(pipe.device), 0)

    # Project image into the latent space
    clean_z = pipe.vae.encode(tensor).latent_dist.mode() # From huggingface/diffusers/blob/main/src/diffusers/models/vae.py
    # NOTE(review): 0.18215 is presumably the VAE latent scaling factor the
    # pipeline expects -- confirm against the pipeline implementation.
    clean_z = 0.18215 * clean_z

    # Add noise to the latent variable
    # (this is the forward diffusion process)
    noise = torch.randn(clean_z.shape).to(pipe.device)
    # Build the integer timestep directly as a long tensor on the target device.
    timestep = torch.tensor(
      [int(pipe.scheduler.config.num_train_timesteps * percent_noise)],
      dtype=torch.long,
      device=pipe.device,
    )
    z = pipe.scheduler.add_noise(clean_z, noise, timestep)
    if HALF_PRECISION:
      z = z.half()

    # Run the diffusion model
    # The 'num_inference_steps=100' arguments means that, if you were running
    # the full diffusion model (i.e., percent_noise = 0.999), it would be sampling
    # at 2x frequency compared to the normal stable diffusion model (which uses 50 steps).
    # This way, running percent_noise = 0.5 yields 50 inference steps,
    # and running percent_noise = 0.2 yields 20 inference steps, etc.
    result = pipe(prompt=prompt, latents=z, num_inference_steps=100, percent_noise=percent_noise)
    return result.images[0]
    
    
    
    
  
  
# Gradio front end: an image, a text prompt and a noise-level slider are fed
# to `main`, and the returned image is displayed.
_demo_inputs = [
    gr.Image(type="pil", shape=(512, 512)),
    "text",
    gr.Slider(0.02, 0.999, value=0.7, label='percent noise'),
]

# Each example row is [image path, prompt, percent noise].
_demo_examples = [
    ["original.png", "person", 0.7],
    ['cat.png', 'cat', 0.9],
    ['bedroom.jpg', 'bathroom', 0.8],
    ['einstein.jpg', 'dog', 0.8],
    ['oprah.jpeg', 'pirate', 0.8],
]

_demo = gr.Interface(
    fn=main,
    inputs=_demo_inputs,
    outputs=gr.Image(type="pil", shape=(512,512)),
    examples=_demo_examples,
)
_demo.launch()