Spaces:

snap-research
/

weights2weights

Running on Zero

App Files Files Community

amildravid4292 committed on Jul 21, 2024

Commit

9f713c2

verified ·

1 Parent(s): fc54821

Update app.py

Browse files

Files changed (1) hide show

app.py +134 -197

app.py CHANGED Viewed

@@ -6,210 +6,97 @@ from torch.utils.data import Dataset, DataLoader
 import gradio as gr
 import sys
 import tqdm
-import uuid
 sys.path.append(os.path.abspath(os.path.join("", "..")))
 import gc
 import warnings
 warnings.filterwarnings("ignore")
 from PIL import Image
 import numpy as np
 from editing import get_direction, debias
 from lora_w2w import LoRAw2w
 from huggingface_hub import snapshot_download
 import spaces
-from transformers import CLIPTextModel
-from lora_w2w import LoRAw2w
-from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel, LMSDiscreteScheduler
-from transformers import AutoTokenizer, PretrainedConfig
-import warnings
-warnings.filterwarnings("ignore")
-from diffusers import (
-    AutoencoderKL,
-    DDPMScheduler,
-    DiffusionPipeline,
-    DPMSolverMultistepScheduler,
-    UNet2DConditionModel,
-    PNDMScheduler,
-    StableDiffusionPipeline
-)
-device = gr.State("cuda")
-unet = gr.State()
-vae = gr.State()
-text_encoder = gr.State()
-tokenizer = gr.State()
-noise_scheduler = gr.State()
-network = gr.State()
-pretrained_model_name_or_path = "stablediffusionapi/realistic-vision-v51"
-revision = None
-rank = 1
-weight_dtype = torch.bfloat16
-# Load scheduler, tokenizer and models.
-pipe = StableDiffusionPipeline.from_pretrained("stablediffusionapi/realistic-vision-v51",
-                                            torch_dtype=torch.float16,safety_checker = None,
-                                            requires_safety_checker = False).to(device.value)
-noise_scheduler.value = pipe.scheduler
-del pipe
-tokenizer.value = AutoTokenizer.from_pretrained(
-        pretrained_model_name_or_path, subfolder="tokenizer", revision=revision
-    )
-text_encoder.value = CLIPTextModel.from_pretrained(
-        pretrained_model_name_or_path, subfolder="text_encoder", revision=revision
-    )
-vae.value = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae", revision=revision)
-unet.value = UNet2DConditionModel.from_pretrained(
-        pretrained_model_name_or_path, subfolder="unet", revision=revision
-    )
-unet.value.requires_grad_(False)
-unet.value.to(device.value, dtype=weight_dtype)
-vae.value.requires_grad_(False)
-text_encoder.value.requires_grad_(False)
-vae.value.requires_grad_(False)
-vae.value.to(device.value, dtype=weight_dtype)
-text_encoder.value.to(device.value, dtype=weight_dtype)
-print("")
 models_path = snapshot_download(repo_id="Snapchat/w2w")
-mean = torch.load(f"{models_path}/files/mean.pt", map_location=torch.device('cpu')).bfloat16().to(device.value)
-std = torch.load(f"{models_path}/files/std.pt", map_location=torch.device('cpu')).bfloat16().to(device.value)
-v = torch.load(f"{models_path}/files/V.pt", map_location=torch.device('cpu')).bfloat16().to(device.value)
-proj = torch.load(f"{models_path}/files/proj_1000pc.pt", map_location=torch.device('cpu')).bfloat16().to(device.value)
 df = torch.load(f"{models_path}/files/identity_df.pt")
 weight_dimensions = torch.load(f"{models_path}/files/weight_dimensions.pt")
-pinverse = torch.load(f"{models_path}/files/pinverse_1000pc.pt", map_location=torch.device('cpu')).bfloat16().to(device.value)
-young = gr.State()
-young.value = get_direction(df, "Young", pinverse, 1000, device.value)
-young.value = debias(young.value, "Male", df, pinverse, device.value)
-young.value = debias(young.value, "Pointy_Nose", df, pinverse, device.value)
-young.value = debias(young.value, "Wavy_Hair", df, pinverse, device.value)
-young.value = debias(young.value, "Chubby", df, pinverse, device.value)
-young.value = debias(young.value, "No_Beard", df, pinverse, device.value)
-young.value = debias(young.value, "Mustache", df, pinverse, device.value)
-pointy = gr.State()
-pointy.value = get_direction(df, "Pointy_Nose", pinverse, 1000, device.value)
-pointy.value = debias(pointy.value, "Young", df, pinverse, device.value)
-pointy.value = debias(pointy.value, "Male", df, pinverse, device.value)
-pointy.value = debias(pointy.value, "Wavy_Hair", df, pinverse, device.value)
-pointy.value = debias(pointy.value, "Chubby", df, pinverse, device.value)
-pointy.value = debias(pointy.value, "Heavy_Makeup", df, pinverse, device.value)
-wavy = gr.State()
-wavy.value = get_direction(df, "Wavy_Hair", pinverse, 1000, device.value)
-wavy.value = debias(wavy.value, "Young", df, pinverse, device.value)
-wavy.value = debias(wavy.value, "Male", df, pinverse, device.value)
-wavy.value = debias(wavy.value, "Pointy_Nose", df, pinverse, device.value)
-wavy.value = debias(wavy.value, "Chubby", df, pinverse, device.value)
-wavy.value = debias(wavy.value, "Heavy_Makeup", df, pinverse, device.value)
-thick = gr.State()
-thick.value = get_direction(df, "Bushy_Eyebrows", pinverse, 1000, device.value)
-thick.value = debias(thick.value, "Male", df, pinverse, device.value)
-thick.value = debias(thick.value, "Young", df, pinverse, device.value)
-thick.value = debias(thick.value, "Pointy_Nose", df, pinverse, device.value)
-thick.value = debias(thick.value, "Wavy_Hair", df, pinverse, device.value)
-thick.value = debias(thick.value, "Mustache", df, pinverse, device.value)
-thick.value = debias(thick.value, "No_Beard", df, pinverse, device.value)
-thick.value = debias(thick.value, "Sideburns", df, pinverse, device.value)
-thick.value = debias(thick.value, "Big_Nose", df, pinverse, device.value)
-thick.value = debias(thick.value, "Big_Lips", df, pinverse, device.value)
-thick.value = debias(thick.value, "Black_Hair", df, pinverse, device.value)
-thick.value = debias(thick.value, "Brown_Hair", df, pinverse, device.value)
-thick.value = debias(thick.value, "Pale_Skin", df, pinverse, device.value)
-thick.value = debias(thick.value, "Heavy_Makeup", df, pinverse, device.value)
-@torch.no_grad()
-@spaces.GPU
-def sample_weights(unet, proj, mean, std, v, device, factor = 1.0):
-    # get mean and standard deviation for each principal component
-    m = torch.mean(proj, 0)
-    standev = torch.std(proj, 0)
-    del proj
-    torch.cuda.empty_cache()
-    # sample
-    sample = torch.zeros([1, 1000]).to(device)
-    for i in range(1000):
-        sample[0, i] = torch.normal(m[i], factor*standev[i], (1,1))
-    # load weights into network
-    net = LoRAw2w( sample, mean, std, v,
-                    unet,
-                    rank=1,
-                    multiplier=1.0,
-                    alpha=27.0,
-                    train_method="xattn-strict"
-                ).to(device, torch.bfloat16)
-    return net
-@torch.no_grad()
-@spaces.GPU
 def sample_model():
-    unet.value = UNet2DConditionModel.from_pretrained(
-        pretrained_model_name_or_path, subfolder="unet", revision=revision
-    )
-    unet.value.requires_grad_(False)
-    unet.value.to(device.value, dtype=weight_dtype)
-    network.value = sample_weights(unet.value, proj, mean, std, v[:, :1000], device.value, factor = 1.00)
 @torch.no_grad()
 @spaces.GPU
 def inference( prompt, negative_prompt, guidance_scale, ddim_steps, seed):
-    generator = torch.Generator(device=device.value).manual_seed(seed)
     latents = torch.randn(
-        (1, unet.value.in_channels, 512 // 8, 512 // 8),
         generator = generator,
-        device = device.value
     ).bfloat16()
-    text_input = tokenizer.value(prompt, padding="max_length", max_length=tokenizer.value.model_max_length, truncation=True, return_tensors="pt")
-    text_embeddings = text_encoder.value(text_input.input_ids.to(device.value))[0]
     max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer.value(
                             [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
                         )
-    uncond_embeddings = text_encoder.value(uncond_input.input_ids.to(device.value))[0]
     text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-    noise_scheduler.value.set_timesteps(ddim_steps)
-    latents = latents * noise_scheduler.value.init_noise_sigma
-    for i,t in enumerate(tqdm.tqdm(noise_scheduler.value.timesteps)):
         latent_model_input = torch.cat([latents] * 2)
-        latent_model_input = noise_scheduler.value.scale_model_input(latent_model_input, timestep=t)
-        with network.value:
-            noise_pred = unet.value(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond= None).sample
         #guidance
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-        latents = noise_scheduler.value.step(noise_pred, t, latents).prev_sample
     latents = 1 / 0.18215 * latents
-    image = vae.value.decode(latents).sample
     image = (image / 2 + 0.5).clamp(0, 1)
     image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
@@ -221,67 +108,78 @@ def inference( prompt, negative_prompt, guidance_scale, ddim_steps, seed):
 @torch.no_grad()
 @spaces.GPU
 def edit_inference(prompt, negative_prompt, guidance_scale, ddim_steps, seed, start_noise, a1, a2, a3, a4):
-    original_weights = network.value.proj.clone()
     #pad to same number of PCs
     pcs_original = original_weights.shape[1]
-    pcs_edits = young.value.shape[1]
-    padding =  torch.zeros((1,pcs_original-pcs_edits)).to(device.value)
-    young_pad = torch.cat((young.value, padding), 1)
-    pointy_pad = torch.cat((pointy.value, padding), 1)
-    wavy_pad = torch.cat((wavy.value, padding), 1)
-    thick_pad = torch.cat((thick.value, padding), 1)
     edited_weights = original_weights+a1*1e6*young_pad+a2*1e6*pointy_pad+a3*1e6*wavy_pad+a4*2e6*thick_pad
-    generator = torch.Generator(device=device.value).manual_seed(seed)
     latents = torch.randn(
-        (1, unet.value.in_channels, 512 // 8, 512 // 8),
         generator = generator,
-        device = device.value
     ).bfloat16()
-    text_input = tokenizer.value(prompt, padding="max_length", max_length=tokenizer.value.model_max_length, truncation=True, return_tensors="pt")
-    text_embeddings = text_encoder.value(text_input.input_ids.to(device.value))[0]
     max_length = text_input.input_ids.shape[-1]
-    uncond_input = tokenizer.value(
                             [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
                         )
-    uncond_embeddings = text_encoder.value(uncond_input.input_ids.to(device.value))[0]
     text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-    noise_scheduler.value.set_timesteps(ddim_steps)
-    latents = latents * noise_scheduler.value.init_noise_sigma
-    for i,t in enumerate(tqdm.tqdm(noise_scheduler.value.timesteps)):
         latent_model_input = torch.cat([latents] * 2)
-        latent_model_input = noise_scheduler.value.scale_model_input(latent_model_input, timestep=t)
         if t>start_noise:
             pass
         elif t<=start_noise:
-            network.value.proj = torch.nn.Parameter(edited_weights)
-            network.value.reset()
         with network:
-            noise_pred = unet.value(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond= None).sample
         #guidance
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-        latents = noise_scheduler.value.step(noise_pred, t, latents).prev_sample
     latents = 1 / 0.18215 * latents
-    image = vae.value.decode(latents).sample
     image = (image / 2 + 0.5).clamp(0, 1)
     image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
@@ -289,12 +187,11 @@ def edit_inference(prompt, negative_prompt, guidance_scale, ddim_steps, seed, st
     image = Image.fromarray((image * 255).round().astype("uint8"))
     #reset weights back to original
-    network.value.proj = torch.nn.Parameter(original_weights)
-    network.value.reset()
     return image
-@torch.no_grad()
 @spaces.GPU
 def sample_then_run():
     sample_model()
@@ -304,12 +201,52 @@ def sample_then_run():
     cfg = 3.0
     steps = 25
     image = inference( prompt, negative_prompt, cfg, steps, seed)
-    torch.save(network.value.proj.detach(), "model.pt" )
-    # net = torch.load("model.pt").cpu()
-    network.value.proj.detach().cpu()
-    return image, "model.pt", network.value #net   #, network.value.cpu()
 class CustomImageDataset(Dataset):
     def __init__(self, images, transform=None):
@@ -542,7 +479,7 @@ with gr.Blocks(css="style.css") as demo:
                     outputs = [input_image, file_output])
-        sample.click(fn=sample_then_run, outputs=[input_image, file_output, network])
         submit.click(
             fn=edit_inference, inputs=[prompt, negative_prompt, cfg, steps, seed, injection_step, a1, a2, a3, a4], outputs=[gallery]

 import gradio as gr
 import sys
 import tqdm
 sys.path.append(os.path.abspath(os.path.join("", "..")))
 import gc
 import warnings
 warnings.filterwarnings("ignore")
 from PIL import Image
 import numpy as np
+from utils import load_models
 from editing import get_direction, debias
+from sampling import sample_weights
 from lora_w2w import LoRAw2w
 from huggingface_hub import snapshot_download
 import spaces
+import uuid
+global device
+global generator
+global unet
+global vae
+global text_encoder
+global tokenizer
+global noise_scheduler
+global network
 # NOTE(review): the `global` statements above sit at module scope, where the
 # names are already module-level; they do not create or initialise anything.
 # `generator` and `network` are not assigned anywhere in this setup section.
+device = "cuda"
+#generator = torch.Generator(device=device)
 # Download the Snapchat/w2w repository snapshot; the returned path is used as
 # the prefix for every file loaded below.
 models_path = snapshot_download(repo_id="Snapchat/w2w")
 # Each of these tensors is deserialised on the CPU, cast to bfloat16 and then
 # moved to `device`.
+mean = torch.load(f"{models_path}/files/mean.pt", map_location=torch.device('cpu')).bfloat16().to(device)
+std = torch.load(f"{models_path}/files/std.pt", map_location=torch.device('cpu')).bfloat16().to(device)
+v = torch.load(f"{models_path}/files/V.pt", map_location=torch.device('cpu')).bfloat16().to(device)
+proj = torch.load(f"{models_path}/files/proj_1000pc.pt", map_location=torch.device('cpu')).bfloat16().to(device)
 # These two are loaded as stored, with no map_location, dtype cast or device move.
 df = torch.load(f"{models_path}/files/identity_df.pt")
 weight_dimensions = torch.load(f"{models_path}/files/weight_dimensions.pt")
+pinverse = torch.load(f"{models_path}/files/pinverse_1000pc.pt", map_location=torch.device('cpu')).bfloat16().to(device)
 # load_models() comes from utils (not visible here); it returns five objects
 # that become the module-level model components used by the functions below.
+unet, vae, text_encoder, tokenizer, noise_scheduler = load_models(device)
 def sample_model():
     # Replace the module-level UNet with a freshly loaded one and build a new
     # `network` from it via sample_weights(). Returns nothing: the results are
     # published through the `unet` and `network` globals.
+    global unet
     # Drop the current global reference before the replacement is loaded.
+    del unet
+    global network
     # NOTE(review): mean/std/v/proj are tensors created at module level, and
     # Tensor.to() returns its result rather than modifying the tensor, so the
     # four calls below discard their result and have no effect here.
+    mean.to(device)
+    std.to(device)
+    v.to(device)
+    proj.to(device)
     # load_models() returns five objects; only the first (the UNet) is kept.
+    unet, _, _, _, _ = load_models(device)
     # Only the first 1000 columns of `v` are passed on.
     # presumably sample_weights() draws new weights and wraps the UNet; it is
     # imported from sampling and not visible here - TODO confirm.
+    network = sample_weights(unet, proj, mean, std, v[:, :1000], device, factor = 1.00)
 @torch.no_grad()
 @spaces.GPU
 def inference( prompt, negative_prompt, guidance_scale, ddim_steps, seed):
     # Denoise seeded random latents under `prompt` / `negative_prompt` using
     # the module-level UNet, scheduler, text encoder and VAE, with `network`
     # active around every UNet call.
     # NOTE(review): the end of this function (after the numpy conversion) is
     # not part of the visible diff, so its return value is not shown here.
+    global device
+    #global generator
+    global unet
+    global vae
+    global text_encoder
+    global tokenizer
+    global noise_scheduler
     # A fresh generator per call makes the initial noise depend only on `seed`.
+    generator = torch.Generator(device=device).manual_seed(seed)
     # One latent sample on a 64x64 grid (512 // 8), cast to bfloat16.
     latents = torch.randn(
+        (1, unet.in_channels, 512 // 8, 512 // 8),
         generator = generator,
+        device = device
     ).bfloat16()
     # Encode the prompt, padded/truncated to the tokenizer's maximum length.
+    text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
+    text_embeddings = text_encoder(text_input.input_ids.to(device))[0]
     max_length = text_input.input_ids.shape[-1]
     # Encode the negative prompt, padded to the same token length.
+    uncond_input = tokenizer(
                              [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
                          )
+    uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]
     # Negative-prompt embeddings first, prompt embeddings second; this order
     # must match the chunk(2) unpacking inside the loop.
     text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+    noise_scheduler.set_timesteps(ddim_steps)
+    latents = latents * noise_scheduler.init_noise_sigma
+    for i,t in enumerate(tqdm.tqdm(noise_scheduler.timesteps)):
         # Duplicate the latents so both embeddings are evaluated in one pass.
         latent_model_input = torch.cat([latents] * 2)
+        latent_model_input = noise_scheduler.scale_model_input(latent_model_input, timestep=t)
         # `network` is read from module scope (assigned in sample_model()).
         # presumably entering it applies the sampled weights to the UNet for
         # the duration of the call - TODO confirm against lora_w2w.
+        with network:
+            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond= None).sample
         #guidance
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+        latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
     # presumably undoes the VAE latent scaling factor - TODO confirm.
     latents = 1 / 0.18215 * latents
+    image = vae.decode(latents).sample
     # Shift/scale and clamp the decoded values into [0, 1].
     image = (image / 2 + 0.5).clamp(0, 1)
     # Move axis 1 to the end and keep the first (only) batch element.
     image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
 @torch.no_grad()
 @spaces.GPU
 def edit_inference(prompt, negative_prompt, guidance_scale, ddim_steps, seed, start_noise, a1, a2, a3, a4):
     # Same sampling loop as inference(), but with `network.proj` shifted along
     # four edit directions for the later denoising steps.
     #   a1..a4      - strengths for the young / pointy / wavy / thick directions
     #   start_noise - timestep threshold; the edited weights are installed for
     #                 timesteps at or below it
     # Returns the generated PIL image; network.proj is reset to its original
     # value before returning.
     # NOTE(review): start_items() recomputes all four directions on every call.
+    start_items()
+    global device
+    #global generator
+    global unet
+    global vae
+    global text_encoder
+    global tokenizer
+    global noise_scheduler
+    global young
+    global pointy
+    global wavy
+    global thick
     # Keep a copy of the current coefficients so they can be restored at the end.
+    original_weights = network.proj.clone()
     #pad to same number of PCs
     pcs_original = original_weights.shape[1]
+    pcs_edits = young.shape[1]
     # One zero block, sized from `young`, is appended to all four directions.
+    padding =  torch.zeros((1,pcs_original-pcs_edits)).to(device)
+    young_pad = torch.cat((young, padding), 1)
+    pointy_pad = torch.cat((pointy, padding), 1)
+    wavy_pad = torch.cat((wavy, padding), 1)
+    thick_pad = torch.cat((thick, padding), 1)
     # The thick direction (a4) is scaled by 2e6, the other three by 1e6.
     edited_weights = original_weights+a1*1e6*young_pad+a2*1e6*pointy_pad+a3*1e6*wavy_pad+a4*2e6*thick_pad
     # A fresh generator per call makes the initial noise depend only on `seed`.
+    generator = torch.Generator(device=device).manual_seed(seed)
     # One latent sample on a 64x64 grid (512 // 8), cast to bfloat16.
     latents = torch.randn(
+        (1, unet.in_channels, 512 // 8, 512 // 8),
         generator = generator,
+        device = device
     ).bfloat16()
+    text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
+    text_embeddings = text_encoder(text_input.input_ids.to(device))[0]
     max_length = text_input.input_ids.shape[-1]
+    uncond_input = tokenizer(
                              [negative_prompt], padding="max_length", max_length=max_length, return_tensors="pt"
                          )
+    uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]
     # Negative-prompt embeddings first, prompt embeddings second; this order
     # must match the chunk(2) unpacking inside the loop.
     text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+    noise_scheduler.set_timesteps(ddim_steps)
+    latents = latents * noise_scheduler.init_noise_sigma
+    for i,t in enumerate(tqdm.tqdm(noise_scheduler.timesteps)):
         latent_model_input = torch.cat([latents] * 2)
+        latent_model_input = noise_scheduler.scale_model_input(latent_model_input, timestep=t)
         # The `t>start_noise` branch is a no-op: the unedited weights stay in
         # place. For every step at or below start_noise the edited weights are
         # re-installed (new Parameter followed by reset()).
         if t>start_noise:
             pass
         elif t<=start_noise:
+            network.proj = torch.nn.Parameter(edited_weights)
+            network.reset()
         with network:
+            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings, timestep_cond= None).sample
         #guidance
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+        latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
     # presumably undoes the VAE latent scaling factor - TODO confirm.
     latents = 1 / 0.18215 * latents
+    image = vae.decode(latents).sample
     # Shift/scale and clamp the decoded values into [0, 1].
     image = (image / 2 + 0.5).clamp(0, 1)
     # Move axis 1 to the end and keep the first (only) batch element.
     image = image.detach().cpu().float().permute(0, 2, 3, 1).numpy()[0]
     image = Image.fromarray((image * 255).round().astype("uint8"))
     #reset weights back to original
     # NOTE(review): not reached if anything above raises, which would leave the
     # edited weights installed in `network`.
+    network.proj = torch.nn.Parameter(original_weights)
+    network.reset()
     return image
 @spaces.GPU
 def sample_then_run():
     # Sample a new model, render one image with it and save its coefficients.
     sample_model()
     # NOTE(review): prompt, negative_prompt and seed used below are assigned in
     # lines that the diff view skips; they are not visible in this listing.
     cfg = 3.0
     steps = 25
     image = inference( prompt, negative_prompt, cfg, steps, seed)
     # The file name is fixed, so every call overwrites the same "model.pt".
+    torch.save(network.proj, "model.pt" )
     # Returns the image and the saved file's path; the sample button wires
     # these to input_image and file_output.
+    return image, "model.pt"
+#@spaces.GPU
+def start_items():
     # Build the four edit directions read by edit_inference() and publish them
     # as the globals young, pointy, wavy and thick. Each one starts from
     # get_direction() for its target label and is then passed through debias()
     # once per listed attribute, in the order written.
     # presumably 1000 is the number of principal components used, and debias()
     # removes the named attribute's component from the direction - both
     # helpers live in editing.py, outside this view; TODO confirm.
+    print("Starting items")
+    global young
+    global pointy
+    global wavy
+    global thick
     # "Young" direction.
+    young = get_direction(df, "Young", pinverse, 1000, device)
+    young = debias(young, "Male", df, pinverse, device)
+    young = debias(young, "Pointy_Nose", df, pinverse, device)
+    young = debias(young, "Wavy_Hair", df, pinverse, device)
+    young = debias(young, "Chubby", df, pinverse, device)
+    young = debias(young, "No_Beard", df, pinverse, device)
+    young = debias(young, "Mustache", df, pinverse, device)
     # "Pointy_Nose" direction.
+    pointy = get_direction(df, "Pointy_Nose", pinverse, 1000, device)
+    pointy = debias(pointy, "Young", df, pinverse, device)
+    pointy = debias(pointy, "Male", df, pinverse, device)
+    pointy = debias(pointy, "Wavy_Hair", df, pinverse, device)
+    pointy = debias(pointy, "Chubby", df, pinverse, device)
+    pointy = debias(pointy, "Heavy_Makeup", df, pinverse, device)
     # "Wavy_Hair" direction.
+    wavy = get_direction(df, "Wavy_Hair", pinverse, 1000, device)
+    wavy = debias(wavy, "Young", df, pinverse, device)
+    wavy = debias(wavy, "Male", df, pinverse, device)
+    wavy = debias(wavy, "Pointy_Nose", df, pinverse, device)
+    wavy = debias(wavy, "Chubby", df, pinverse, device)
+    wavy = debias(wavy, "Heavy_Makeup", df, pinverse, device)
     # `thick` is built from the "Bushy_Eyebrows" label and has the longest
     # debias list of the four.
+    thick = get_direction(df, "Bushy_Eyebrows", pinverse, 1000, device)
+    thick = debias(thick, "Male", df, pinverse, device)
+    thick = debias(thick, "Young", df, pinverse, device)
+    thick = debias(thick, "Pointy_Nose", df, pinverse, device)
+    thick = debias(thick, "Wavy_Hair", df, pinverse, device)
+    thick = debias(thick, "Mustache", df, pinverse, device)
+    thick = debias(thick, "No_Beard", df, pinverse, device)
+    thick = debias(thick, "Sideburns", df, pinverse, device)
+    thick = debias(thick, "Big_Nose", df, pinverse, device)
+    thick = debias(thick, "Big_Lips", df, pinverse, device)
+    thick = debias(thick, "Black_Hair", df, pinverse, device)
+    thick = debias(thick, "Brown_Hair", df, pinverse, device)
+    thick = debias(thick, "Pale_Skin", df, pinverse, device)
+    thick = debias(thick, "Heavy_Makeup", df, pinverse, device)
 class CustomImageDataset(Dataset):
     def __init__(self, images, transform=None):
                     outputs = [input_image, file_output])
+        sample.click(fn=sample_then_run, outputs=[input_image, file_output])
         submit.click(
             fn=edit_inference, inputs=[prompt, negative_prompt, cfg, steps, seed, injection_step, a1, a2, a3, a4], outputs=[gallery]