latentnavigation-flux

Running on Zero

App Files Files Community

support flux

by linoyts HF staff - opened Aug 21, 2024

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+212

-2101

Files changed (2) hide show

app.py +0 -0
clip_slider_pipeline.py +171 -75

app.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

clip_slider_pipeline.py CHANGED Viewed

@@ -4,26 +4,23 @@ import random
 from tqdm import tqdm
 from constants import SUBJECTS, MEDIUMS
 from PIL import Image
-import time
 class CLIPSlider:
     def __init__(
             self,
             sd_pipe,
             device: torch.device,
-            target_word: str = "",
-            opposite: str = "",
             target_word_2nd: str = "",
             opposite_2nd: str = "",
             iterations: int = 300,
     ):
         self.device = device
-        self.pipe = sd_pipe.to(self.device, torch.float16)
         self.iterations = iterations
-        if target_word != "" or opposite != "":
-            self.avg_diff = self.find_latent_direction(target_word, opposite)
-        else:
-            self.avg_diff = None
         if target_word_2nd != "" or opposite_2nd != "":
             self.avg_diff_2nd = self.find_latent_direction(target_word_2nd, opposite_2nd)
         else:
@@ -32,21 +29,17 @@ class CLIPSlider:
     def find_latent_direction(self,
                               target_word:str,
-                              opposite:str,
-                            num_iterations: int = None):
         # lets identify a latent direction by taking differences between opposites
         # target_word = "happy"
         # opposite = "sad"
-        if num_iterations is not None:
-            iterations = num_iterations
-        else:
-            iterations = self.iterations
         with torch.no_grad():
             positives = []
             negatives = []
-            for i in tqdm(range(iterations)):
                 medium = random.choice(MEDIUMS)
                 subject = random.choice(SUBJECTS)
                 pos_prompt = f"a {medium} of a {target_word} {subject}"
@@ -77,8 +70,6 @@ class CLIPSlider:
         only_pooler = False,
         normalize_scales = False, # whether to normalize the scales when avg_diff_2nd is not None
         correlation_weight_factor = 1.0,
-        avg_diff = None,
-        avg_diff_2nd = None,
         **pipeline_kwargs
         ):
         # if doing full sequence, [-0.3,0.3] work well, higher if correlation weighted is true
@@ -89,14 +80,14 @@ class CLIPSlider:
                                   max_length=self.pipe.tokenizer.model_max_length).input_ids.cuda()
         prompt_embeds = self.pipe.text_encoder(toks).last_hidden_state
-        if avg_diff_2nd and normalize_scales:
             denominator = abs(scale) + abs(scale_2nd)
             scale = scale / denominator
             scale_2nd = scale_2nd / denominator
         if only_pooler:
-            prompt_embeds[:, toks.argmax()] = prompt_embeds[:, toks.argmax()] + avg_diff * scale
-            if avg_diff_2nd:
-                prompt_embeds[:, toks.argmax()] += avg_diff_2nd * scale_2nd
         else:
             normed_prompt_embeds = prompt_embeds / prompt_embeds.norm(dim=-1, keepdim=True)
         sims = normed_prompt_embeds[0] @ normed_prompt_embeds[0].T
@@ -108,15 +99,15 @@ class CLIPSlider:
         # weights = torch.sigmoid((weights-0.5)*7)
         prompt_embeds = prompt_embeds + (
-                    weights * avg_diff[None, :].repeat(1, self.pipe.tokenizer.model_max_length, 1) * scale)
-        if avg_diff_2nd:
-            prompt_embeds += weights * avg_diff_2nd[None, :].repeat(1, self.pipe.tokenizer.model_max_length, 1) * scale_2nd
         torch.manual_seed(seed)
-        image = self.pipe(prompt_embeds=prompt_embeds, **pipeline_kwargs).images[0]
-        return image
     def spectrum(self,
                  prompt="a photo of a house",
@@ -149,23 +140,19 @@ class CLIPSliderXL(CLIPSlider):
     def find_latent_direction(self,
                               target_word:str,
-                              opposite:str,
-                             num_iterations: int = None):
         # lets identify a latent direction by taking differences between opposites
         # target_word = "happy"
         # opposite = "sad"
-        if num_iterations is not None:
-            iterations = num_iterations
-        else:
-            iterations = self.iterations
         with torch.no_grad():
             positives = []
             negatives = []
             positives2 = []
             negatives2 = []
-            for i in tqdm(range(iterations)):
                 medium = random.choice(MEDIUMS)
                 subject = random.choice(SUBJECTS)
                 pos_prompt = f"a {medium} of a {target_word} {subject}"
@@ -208,13 +195,11 @@ class CLIPSliderXL(CLIPSlider):
         only_pooler = False,
         normalize_scales = False,
         correlation_weight_factor = 1.0,
-        avg_diff = None,
-        avg_diff_2nd = None,
         **pipeline_kwargs
         ):
         # if doing full sequence, [-0.3,0.3] work well, higher if correlation weighted is true
         # if pooler token only [-4,4] work well
-        start_time = time.time()
         text_encoders = [self.pipe.text_encoder, self.pipe.text_encoder_2]
         tokenizers = [self.pipe.tokenizer, self.pipe.tokenizer_2]
         with torch.no_grad():
@@ -239,21 +224,20 @@ class CLIPSliderXL(CLIPSlider):
                     toks.to(text_encoder.device),
                     output_hidden_states=True,
                 )
                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                pooled_prompt_embeds = prompt_embeds[0]
                 prompt_embeds = prompt_embeds.hidden_states[-2]
-                print("prompt_embeds.dtype",prompt_embeds.dtype)
-                if avg_diff_2nd and normalize_scales:
                     denominator = abs(scale) + abs(scale_2nd)
                     scale = scale / denominator
                     scale_2nd = scale_2nd / denominator
                 if only_pooler:
-                    prompt_embeds[:, toks.argmax()] = prompt_embeds[:, toks.argmax()] + avg_diff[0] * scale
-                    if avg_diff_2nd:
-                        prompt_embeds[:, toks.argmax()] += avg_diff_2nd[0] * scale_2nd
                 else:
                     normed_prompt_embeds = prompt_embeds / prompt_embeds.norm(dim=-1, keepdim=True)
                     sims = normed_prompt_embeds[0] @ normed_prompt_embeds[0].T
@@ -263,58 +247,49 @@ class CLIPSliderXL(CLIPSlider):
                         standard_weights = torch.ones_like(weights)
                         weights = standard_weights + (weights - standard_weights) * correlation_weight_factor
-                        prompt_embeds = prompt_embeds + (weights * avg_diff[0][None, :].repeat(1, self.pipe.tokenizer.model_max_length, 1) * scale)
-                        if avg_diff_2nd:
-                            prompt_embeds += (weights * avg_diff_2nd[0][None, :].repeat(1, self.pipe.tokenizer.model_max_length, 1) * scale_2nd)
                     else:
                         weights = sims[toks.argmax(), :][None, :, None].repeat(1, 1, 1280)
                         standard_weights = torch.ones_like(weights)
                         weights = standard_weights + (weights - standard_weights) * correlation_weight_factor
-                        prompt_embeds = prompt_embeds + (weights * avg_diff[1][None, :].repeat(1, self.pipe.tokenizer_2.model_max_length, 1) * scale)
-                        if avg_diff_2nd:
-                            prompt_embeds += (weights * avg_diff_2nd[1][None, :].repeat(1, self.pipe.tokenizer_2.model_max_length, 1) * scale_2nd)
                 bs_embed, seq_len, _ = prompt_embeds.shape
                 prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
                 prompt_embeds_list.append(prompt_embeds)
-            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1).to(torch.float16)
-            pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1).to(torch.float16)
-            end_time = time.time()
-            print("prompt_embeds", prompt_embeds.dtype)
-            print(f"generation time - before pipe: {end_time - start_time:.2f} ms")
             torch.manual_seed(seed)
-            start_time = time.time()
-            image = self.pipe(prompt_embeds=prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds,
-                             **pipeline_kwargs).images[0]
-            end_time = time.time()
-            print(f"generation time - pipe: {end_time - start_time:.2f} ms")
-        return image
 class CLIPSliderXL_inv(CLIPSlider):
     def find_latent_direction(self,
                               target_word:str,
-                              opposite:str,
-                              num_iterations: int = None):
         # lets identify a latent direction by taking differences between opposites
         # target_word = "happy"
         # opposite = "sad"
-        if num_iterations is not None:
-            iterations = num_iterations
-        else:
-            iterations = self.iterations
         with torch.no_grad():
             positives = []
             negatives = []
             positives2 = []
             negatives2 = []
-            for i in tqdm(range(iterations)):
                 medium = random.choice(MEDIUMS)
                 subject = random.choice(SUBJECTS)
                 pos_prompt = f"a {medium} of a {target_word} {subject}"
@@ -357,18 +332,139 @@ class CLIPSliderXL_inv(CLIPSlider):
         only_pooler = False,
         normalize_scales = False,
         correlation_weight_factor = 1.0,
-        avg_diff=None,
-        avg_diff_2nd=None,
-        init_latents=None,
-        zs=None,
         **pipeline_kwargs
         ):
         with torch.no_grad():
             torch.manual_seed(seed)
-            images = self.pipe(editing_prompt=prompt, init_latents=init_latents, zs=zs,
-                               avg_diff=avg_diff[0], avg_diff_2=avg_diff[1],
-                               scale=scale,
                                **pipeline_kwargs).images
         return images

 from tqdm import tqdm
 from constants import SUBJECTS, MEDIUMS
 from PIL import Image
 class CLIPSlider:
     def __init__(
             self,
             sd_pipe,
             device: torch.device,
+            target_word: str,
+            opposite: str,
             target_word_2nd: str = "",
             opposite_2nd: str = "",
             iterations: int = 300,
     ):
         self.device = device
+        self.pipe = sd_pipe.to(self.device)
         self.iterations = iterations
+        self.avg_diff = self.find_latent_direction(target_word, opposite)
         if target_word_2nd != "" or opposite_2nd != "":
             self.avg_diff_2nd = self.find_latent_direction(target_word_2nd, opposite_2nd)
         else:
     def find_latent_direction(self,
                               target_word:str,
+                              opposite:str):
         # lets identify a latent direction by taking differences between opposites
         # target_word = "happy"
         # opposite = "sad"
         with torch.no_grad():
             positives = []
             negatives = []
+            for i in tqdm(range(self.iterations)):
                 medium = random.choice(MEDIUMS)
                 subject = random.choice(SUBJECTS)
                 pos_prompt = f"a {medium} of a {target_word} {subject}"
         only_pooler = False,
         normalize_scales = False, # whether to normalize the scales when avg_diff_2nd is not None
         correlation_weight_factor = 1.0,
         **pipeline_kwargs
         ):
         # if doing full sequence, [-0.3,0.3] work well, higher if correlation weighted is true
                                   max_length=self.pipe.tokenizer.model_max_length).input_ids.cuda()
         prompt_embeds = self.pipe.text_encoder(toks).last_hidden_state
+        if self.avg_diff_2nd and normalize_scales:
             denominator = abs(scale) + abs(scale_2nd)
             scale = scale / denominator
             scale_2nd = scale_2nd / denominator
         if only_pooler:
+            prompt_embeds[:, toks.argmax()] = prompt_embeds[:, toks.argmax()] + self.avg_diff * scale
+            if self.avg_diff_2nd:
+                prompt_embeds[:, toks.argmax()] += self.avg_diff_2nd * scale_2nd
         else:
             normed_prompt_embeds = prompt_embeds / prompt_embeds.norm(dim=-1, keepdim=True)
         sims = normed_prompt_embeds[0] @ normed_prompt_embeds[0].T
         # weights = torch.sigmoid((weights-0.5)*7)
         prompt_embeds = prompt_embeds + (
+                    weights * self.avg_diff[None, :].repeat(1, self.pipe.tokenizer.model_max_length, 1) * scale)
+        if self.avg_diff_2nd:
+            prompt_embeds += weights * self.avg_diff_2nd[None, :].repeat(1, self.pipe.tokenizer.model_max_length, 1) * scale_2nd
         torch.manual_seed(seed)
+        images = self.pipe(prompt_embeds=prompt_embeds, **pipeline_kwargs).images
+        return images
     def spectrum(self,
                  prompt="a photo of a house",
     def find_latent_direction(self,
                               target_word:str,
+                              opposite:str):
         # lets identify a latent direction by taking differences between opposites
         # target_word = "happy"
         # opposite = "sad"
         with torch.no_grad():
             positives = []
             negatives = []
             positives2 = []
             negatives2 = []
+            for i in tqdm(range(self.iterations)):
                 medium = random.choice(MEDIUMS)
                 subject = random.choice(SUBJECTS)
                 pos_prompt = f"a {medium} of a {target_word} {subject}"
         only_pooler = False,
         normalize_scales = False,
         correlation_weight_factor = 1.0,
         **pipeline_kwargs
         ):
         # if doing full sequence, [-0.3,0.3] work well, higher if correlation weighted is true
         # if pooler token only [-4,4] work well
         text_encoders = [self.pipe.text_encoder, self.pipe.text_encoder_2]
         tokenizers = [self.pipe.tokenizer, self.pipe.tokenizer_2]
         with torch.no_grad():
                     toks.to(text_encoder.device),
                     output_hidden_states=True,
                 )
                 # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
                 prompt_embeds = prompt_embeds.hidden_states[-2]
+                if self.avg_diff_2nd and normalize_scales:
                     denominator = abs(scale) + abs(scale_2nd)
                     scale = scale / denominator
                     scale_2nd = scale_2nd / denominator
                 if only_pooler:
+                    prompt_embeds[:, toks.argmax()] = prompt_embeds[:, toks.argmax()] + self.avg_diff[0] * scale
+                    if self.avg_diff_2nd:
+                        prompt_embeds[:, toks.argmax()] += self.avg_diff_2nd[0] * scale_2nd
                 else:
                     normed_prompt_embeds = prompt_embeds / prompt_embeds.norm(dim=-1, keepdim=True)
                     sims = normed_prompt_embeds[0] @ normed_prompt_embeds[0].T
                         standard_weights = torch.ones_like(weights)
                         weights = standard_weights + (weights - standard_weights) * correlation_weight_factor
+                        prompt_embeds = prompt_embeds + (weights * self.avg_diff[0][None, :].repeat(1, self.pipe.tokenizer.model_max_length, 1) * scale)
+                        if self.avg_diff_2nd:
+                            prompt_embeds += (weights * self.avg_diff_2nd[0][None, :].repeat(1, self.pipe.tokenizer.model_max_length, 1) * scale_2nd)
                     else:
                         weights = sims[toks.argmax(), :][None, :, None].repeat(1, 1, 1280)
                         standard_weights = torch.ones_like(weights)
                         weights = standard_weights + (weights - standard_weights) * correlation_weight_factor
+                        prompt_embeds = prompt_embeds + (weights * self.avg_diff[1][None, :].repeat(1, self.pipe.tokenizer_2.model_max_length, 1) * scale)
+                        if self.avg_diff_2nd:
+                            prompt_embeds += (weights * self.avg_diff_2nd[1][None, :].repeat(1, self.pipe.tokenizer_2.model_max_length, 1) * scale_2nd)
                 bs_embed, seq_len, _ = prompt_embeds.shape
                 prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
                 prompt_embeds_list.append(prompt_embeds)
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+            pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
             torch.manual_seed(seed)
+            images = self.pipe(prompt_embeds=prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds,
+                         **pipeline_kwargs).images
+        return images
 class CLIPSliderXL_inv(CLIPSlider):
     def find_latent_direction(self,
                               target_word:str,
+                              opposite:str):
         # lets identify a latent direction by taking differences between opposites
         # target_word = "happy"
         # opposite = "sad"
         with torch.no_grad():
             positives = []
             negatives = []
             positives2 = []
             negatives2 = []
+            for i in tqdm(range(self.iterations)):
                 medium = random.choice(MEDIUMS)
                 subject = random.choice(SUBJECTS)
                 pos_prompt = f"a {medium} of a {target_word} {subject}"
         only_pooler = False,
         normalize_scales = False,
         correlation_weight_factor = 1.0,
         **pipeline_kwargs
         ):
         with torch.no_grad():
             torch.manual_seed(seed)
+            images = self.pipe(editing_prompt=prompt,
+                               avg_diff=self.avg_diff, avg_diff_2nd=self.avg_diff_2nd,
+                               scale=scale, scale_2nd=scale_2nd,
                                **pipeline_kwargs).images
         return images
+class T5SliderFlux(CLIPSlider):
+    """Slider that steers the ``text_encoder_2`` sequence embeddings of ``self.pipe``.
+
+    The pooled embedding is taken unchanged from ``self.pipe.text_encoder``
+    (``pooler_output``); only the embeddings produced by
+    ``self.pipe.text_encoder_2`` are shifted along the learned direction(s).
+    """
+    def find_latent_direction(self,
+                              target_word:str,
+                              opposite:str):
+        """Estimate a direction in ``text_encoder_2`` embedding space.
+
+        For ``self.iterations`` random (medium, subject) pairs, the prompts
+        "a {medium} of a {target_word} {subject}" and
+        "a {medium} of a {opposite} {subject}" are encoded and their
+        embeddings subtracted.
+
+        Args:
+            target_word: word describing the positive end of the slider.
+            opposite: word describing the negative end of the slider.
+
+        Returns:
+            The mean of the per-prompt differences, with a leading dimension
+            of size 1 (``mean(0, keepdim=True)``).
+        """
+        # lets identify a latent direction by taking differences between opposites
+        # target_word = "happy"
+        # opposite = "sad"
+        tokenizer = self.pipe.tokenizer_2
+        encoder = self.pipe.text_encoder_2
+        def encode(text):
+            # Both prompts of a pair must be tokenized identically so that the
+            # resulting embeddings can be subtracted element-wise.
+            toks = tokenizer(text,
+                             return_tensors="pt",
+                             padding="max_length",
+                             truncation=True,
+                             return_length=False,
+                             return_overflowing_tokens=False,
+                             max_length=tokenizer.model_max_length).input_ids
+            # Use the configured device instead of a hard-coded .cuda(), matching generate().
+            return encoder(toks.to(self.device), output_hidden_states=False)[0]
+        positives = []
+        negatives = []
+        with torch.no_grad():
+            for _ in tqdm(range(self.iterations)):
+                medium = random.choice(MEDIUMS)
+                subject = random.choice(SUBJECTS)
+                positives.append(encode(f"a {medium} of a {target_word} {subject}"))
+                negatives.append(encode(f"a {medium} of a {opposite} {subject}"))
+        diffs = torch.cat(positives, dim=0) - torch.cat(negatives, dim=0)
+        return diffs.mean(0, keepdim=True)
+    def generate(self,
+        prompt = "a photo of a house",
+        scale = 2,
+        scale_2nd = 2,
+        seed = 15,
+        only_pooler = False,
+        normalize_scales = False,
+        correlation_weight_factor = 1.0,
+        **pipeline_kwargs
+        ):
+        """Run ``self.pipe`` on ``prompt`` with its embeddings shifted along the slider direction(s).
+
+        Args:
+            prompt: text prompt to encode.
+            scale: multiplier applied to ``self.avg_diff``.
+            scale_2nd: multiplier applied to ``self.avg_diff_2nd`` when it is set.
+            seed: value passed to ``torch.manual_seed`` right before the pipeline call.
+            only_pooler: if true, shift only the token position ``toks.argmax()``
+                instead of the whole sequence.
+            normalize_scales: if true and a second direction is set, divide both
+                scales by ``abs(scale) + abs(scale_2nd)``.
+            correlation_weight_factor: interpolates the per-token weights between
+                all-ones (0.0) and the cosine similarity to the ``toks.argmax()``
+                token (1.0).
+            **pipeline_kwargs: forwarded unchanged to ``self.pipe``.
+
+        Returns:
+            The ``.images`` attribute of the pipeline output.
+        """
+        # if doing full sequence, [-0.3,0.3] work well, higher if correlation weighted is true
+        # if pooler token only [-4,4] work well
+        # A tensor with more than one element has no truth value, so the optional
+        # second direction must be tested explicitly.
+        # presumably it is None when no second word pair was given - confirm in __init__.
+        has_second = self.avg_diff_2nd is not None
+        with torch.no_grad():
+            text_inputs = self.pipe.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=77,
+                truncation=True,
+                return_overflowing_tokens=False,
+                return_length=False,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = self.pipe.text_encoder(text_input_ids.to(self.device), output_hidden_states=False)
+            # Use pooled output of CLIPTextModel
+            prompt_embeds = prompt_embeds.pooler_output
+            pooled_prompt_embeds = prompt_embeds.to(dtype=self.pipe.text_encoder.dtype, device=self.device)
+            # Per-token embeddings come from the second tokenizer / text encoder
+            text_inputs = self.pipe.tokenizer_2(
+                prompt,
+                padding="max_length",
+                max_length=512,
+                truncation=True,
+                return_length=False,
+                return_overflowing_tokens=False,
+                return_tensors="pt",
+            )
+            toks = text_inputs.input_ids
+            prompt_embeds = self.pipe.text_encoder_2(toks.to(self.device), output_hidden_states=False)[0]
+            dtype = self.pipe.text_encoder_2.dtype
+            prompt_embeds = prompt_embeds.to(dtype=dtype, device=self.device)
+            if has_second and normalize_scales:
+                denominator = abs(scale) + abs(scale_2nd)
+                # Both scales being zero would divide by zero; leave them untouched then.
+                if denominator != 0:
+                    scale = scale / denominator
+                    scale_2nd = scale_2nd / denominator
+            if only_pooler:
+                # NOTE(review): self.avg_diff keeps a sequence axis here while the indexed
+                # slice does not, so this assignment looks shape-incompatible - confirm
+                # whether the direction should be indexed at the same token position.
+                prompt_embeds[:, toks.argmax()] = prompt_embeds[:, toks.argmax()] + self.avg_diff * scale
+                if has_second:
+                    prompt_embeds[:, toks.argmax()] += self.avg_diff_2nd * scale_2nd
+            else:
+                normed_prompt_embeds = prompt_embeds / prompt_embeds.norm(dim=-1, keepdim=True)
+                sims = normed_prompt_embeds[0] @ normed_prompt_embeds[0].T
+                weights = sims[toks.argmax(), :][None, :, None].repeat(1, 1, prompt_embeds.shape[2])
+                standard_weights = torch.ones_like(weights)
+                weights = standard_weights + (weights - standard_weights) * correlation_weight_factor
+                # assumes self.avg_diff was computed at the same sequence length as the
+                # 512-token padding used above - TODO confirm tokenizer_2.model_max_length.
+                prompt_embeds = prompt_embeds + (
+                            weights * self.avg_diff * scale)
+                if has_second:
+                    prompt_embeds += (
+                                weights * self.avg_diff_2nd * scale_2nd)
+            torch.manual_seed(seed)
+            images = self.pipe(prompt_embeds=prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds,
+                               **pipeline_kwargs).images
+        return images