jimmycarter committed: Upload 2 files

Files changed:
- README.md (+8 -2)
- pipeline.py (+83 -56)
README.md CHANGED

````diff
@@ -49,8 +49,9 @@ negative_prompt = "blurry"
 images = pipe(
     prompt=prompt,
     negative_prompt=negative_prompt,
+    return_dict=False,
 )
-images[0].save('chalkboard.png')
+images[0][0].save('chalkboard.png')
 
 # If you have <=24 GB VRAM, try:
 # ! pip install optimum-quanto
@@ -67,14 +68,19 @@ quantize(
 )
 freeze(pipe.transformer)
 pipe.enable_model_cpu_offload()
+
+# If you are still running out of memory, add do_batch_cfg=False below.
 images = pipe(
     prompt=prompt,
     negative_prompt=negative_prompt,
     device=None,
+    return_dict=False,
 )
-images[0].save('chalkboard.png')
+images[0][0].save('chalkboard.png')
 ```
 
+For usage in ComfyUI, [a single transformer file is provided](https://huggingface.co/jimmycarter/LibreFLUX/blob/main/transformer_legacy.safetensors) but note that ComfyUI does not presently support attention masks so your images may be degraded.
+
 # Non-technical Report on Schnell De-distillation
 
 Welcome to my non-technical report on de-distilling FLUX.1-schnell in the most un-scientific way possible with extremely limited resources. I'm not going to claim I made a good model, but I did make a model. It was trained on about 1,500 H100 hour equivalents.
````
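Taken together, the README changes amount to: pass `return_dict=False`, index the returned tuple as `images[0][0]`, and optionally pass `do_batch_cfg=False` on low-VRAM machines. Below is a minimal sketch of the updated call. The `DiffusionPipeline.from_pretrained` loading lines and the example prompt are not part of this diff and are assumptions about the surrounding README; `pipe`, `negative_prompt`, `return_dict`, and `do_batch_cfg` come from the hunks above.

```python
import torch
from diffusers import DiffusionPipeline

# Assumed setup (not shown in this diff): load the custom LibreFLUX pipeline.
pipe = DiffusionPipeline.from_pretrained(
    "jimmycarter/LibreFLUX",
    custom_pipeline="jimmycarter/LibreFLUX",
    torch_dtype=torch.bfloat16,
)

prompt = "a chalkboard with writing on it"  # placeholder; the README defines its own prompt
negative_prompt = "blurry"

# With return_dict=False the pipeline returns a plain tuple, so the image list
# is element 0 and the first image is images[0][0].
images = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    return_dict=False,
    # do_batch_cfg=False,  # uncomment if you are running out of memory
)
images[0][0].save('chalkboard.png')
```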
pipeline.py CHANGED

````diff
@@ -1376,8 +1376,7 @@ class CustomPipeline(DiffusionPipeline, SD3LoraLoaderMixin):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         no_cfg_until_timestep: int = 0,
-
-        zero_using_prompt_mask: bool = False,
+        do_batch_cfg: bool=True,
         device=torch.device('cuda'), # TODO let this work with non-cuda stuff? Might if you set this to None
     ):
         r"""
@@ -1510,6 +1509,7 @@ class CustomPipeline(DiffusionPipeline, SD3LoraLoaderMixin):
         )
         if _prompt_mask is not None:
             prompt_mask = _prompt_mask
+        assert prompt_mask is not None
 
         if negative_prompt_2 == "" and negative_prompt != "":
             negative_prompt_2 = negative_prompt
@@ -1537,6 +1537,8 @@ class CustomPipeline(DiffusionPipeline, SD3LoraLoaderMixin):
         if _neg_prompt_mask is not None:
             negative_mask = _neg_prompt_mask
 
+        assert negative_mask is not None
+
         # 4. Prepare latent variables
         num_channels_latents = self.transformer.config.in_channels // 4
         latents, latent_image_ids = self.prepare_latents(
@@ -1601,56 +1603,63 @@ class CustomPipeline(DiffusionPipeline, SD3LoraLoaderMixin):
                 if self.interrupt:
                     continue
 
-                #
-
-
-
+                # Prepare the latent model input
+                prompt_embeds_input = prompt_embeds
+                pooled_prompt_embeds_input = pooled_prompt_embeds
+                text_ids_input = text_ids
+                latent_image_ids_input = latent_image_ids
+                prompt_mask_input = prompt_mask
+                latent_model_input = latents
+
+                if do_batch_cfg and guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
+                    # Concatenate prompt embeddings
+                    prompt_embeds_input = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+                    pooled_prompt_embeds_input = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+                    # # Concatenate text IDs if they are used
+                    # if text_ids is not None and negative_text_ids is not None:
+                    #     text_ids_input = torch.cat([negative_text_ids, text_ids], dim=0)
+
+                    # Concatenate latent image IDs if they are used
+                    # if latent_image_ids is not None:
+                    #     latent_image_ids_input = torch.cat([latent_image_ids, latent_image_ids], dim=0)
+
+                    # Concatenate prompt masks if they are used
+                    if prompt_mask is not None and negative_mask is not None:
+                        prompt_mask_input = torch.cat([negative_mask, prompt_mask], dim=0)
+                    # Duplicate latents for unconditional and conditional inputs
+                    latent_model_input = torch.cat([latents] * 2)
+
+                # Expand timestep to match batch size
+                timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype)
+
+                # Handle guidance
+                if self.transformer.config.guidance_embeds:
+                    guidance = torch.tensor([guidance_scale], device=self.transformer.device)
+                    guidance = guidance.expand(latent_model_input.shape[0])
+                else:
+                    guidance = None
 
+                # Prepare extra transformer arguments
                 extra_transformer_args = {}
-                if
-                    extra_transformer_args["attention_mask"] =
-                elif use_prompt_mask and prompt_mask is not None and zero_using_prompt_mask:
-                    mask_tens = prompt_mask.unsqueeze(-1).to(device=prompt_embeds.device, dtype=prompt_embeds.dtype)
-                    prompt_embeds = prompt_embeds * mask_tens
+                if prompt_mask is not None:
+                    extra_transformer_args["attention_mask"] = prompt_mask_input.to(device=self.transformer.device)
 
+                # Forward pass through the transformer
                 noise_pred = self.transformer(
-                    hidden_states=
-                    # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
+                    hidden_states=latent_model_input.to(device=self.transformer.device),
                     timestep=timestep / 1000,
                     guidance=guidance,
-                    pooled_projections=
-                    encoder_hidden_states=
-                    txt_ids=text_ids,
-                    img_ids=
+                    pooled_projections=pooled_prompt_embeds_input.to(device=self.transformer.device),
+                    encoder_hidden_states=prompt_embeds_input.to(device=self.transformer.device),
+                    txt_ids=text_ids_input.to(device=self.transformer.device) if text_ids is not None else None,
+                    img_ids=latent_image_ids_input.to(device=self.transformer.device) if latent_image_ids is not None else None,
                     joint_attention_kwargs=self.joint_attention_kwargs,
                     return_dict=False,
                     **extra_transformer_args,
                 )[0]
 
-
-                if self._guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
-                    extra_transformer_args_neg = {}
-                    if negative_mask is not None:
-                        extra_transformer_args_neg["attention_mask"] = negative_mask
-                        extra_transformer_args_neg["attention_mask"] is not None
-
-                    noise_pred_uncond = self.transformer(
-                        hidden_states=latents,
-                        # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
-                        timestep=timestep / 1000,
-                        guidance=guidance,
-                        pooled_projections=negative_pooled_prompt_embeds,
-                        encoder_hidden_states=negative_prompt_embeds,
-                        txt_ids=negative_text_ids,
-                        img_ids=latent_image_ids.to(device=device),
-                        joint_attention_kwargs=self.joint_attention_kwargs,
-                        return_dict=False,
-                        **extra_transformer_args_neg,
-                    )[0]
-
-                    noise_pred = noise_pred_uncond + self._guidance_scale_real * (
-                        noise_pred - noise_pred_uncond
-                    )
+                if do_batch_cfg and guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
                     progress_bar.set_postfix(
                         {
                             'ts': timestep.detach().item() / 1000,
@@ -1665,32 +1674,50 @@ class CustomPipeline(DiffusionPipeline, SD3LoraLoaderMixin):
                         },
                     )
 
-                #
+                # Apply real CFG
+                if guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
+                    if do_batch_cfg:
+                        # Batched CFG: Split the noise prediction into unconditional and conditional parts
+                        noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred_cond - noise_pred_uncond)
+                    else:
+                        # Sequential CFG: Compute unconditional noise prediction separately
+                        noise_pred_uncond = self.transformer(
+                            hidden_states=latents.to(device=self.transformer.device),
+                            timestep=timestep / 1000,
+                            guidance=guidance,
+                            pooled_projections=negative_pooled_prompt_embeds.to(device=self.transformer.device),
+                            encoder_hidden_states=negative_prompt_embeds.to(device=self.transformer.device),
+                            txt_ids=negative_text_ids.to(device=self.transformer.device) if negative_text_ids is not None else None,
+                            img_ids=latent_image_ids.to(device=self.transformer.device) if latent_image_ids is not None else None,
+                            joint_attention_kwargs=self.joint_attention_kwargs,
+                            return_dict=False,
+                        )[0]
+
+                        # Combine conditional and unconditional predictions
+                        noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred - noise_pred_uncond)
+
+                # Compute the previous noisy sample x_t -> x_t-1
                 latents_dtype = latents.dtype
-                latents = self.scheduler.step(
-                    noise_pred, t, latents, return_dict=False
-                )[0]
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
 
+                # Ensure latents have the correct dtype
                 if latents.dtype != latents_dtype:
                     if torch.backends.mps.is_available():
-                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                         latents = latents.to(latents_dtype)
 
+                # Callback at the end of the step, if provided
                 if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
+                    callback_kwargs = {k: locals()[k] for k in callback_on_step_end_tensor_inputs}
                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    latents = callback_outputs.get("latents", latents)
+                    prompt_embeds = callback_outputs.get("prompt_embeds", prompt_embeds)
 
-
-
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or (
-                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
-                ):
+                # Update the progress bar
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
 
+                # Mark step for XLA devices
                 if XLA_AVAILABLE:
                     xm.mark_step()
 
````
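The substance of the pipeline.py change is the new `do_batch_cfg` flag. When it is true, the negative and positive prompt embeddings (plus pooled embeddings and attention masks) are concatenated along the batch dimension and the latents are duplicated, so the transformer runs once per step and the two halves of its prediction are recombined with `guidance_scale_real`; when it is false, the unconditional prediction is computed in a separate second forward pass, which lowers peak activation memory at the cost of extra compute. The sketch below only illustrates that arithmetic: the `toy_denoiser` function and the tensor shapes are invented for the example and are not the FLUX transformer interface.

```python
import torch

def toy_denoiser(latents: torch.Tensor, text_emb: torch.Tensor) -> torch.Tensor:
    """Stand-in for the transformer: returns a 'noise prediction' shaped like the latents."""
    return latents * 0.9 + text_emb.mean(dim=(1, 2)).view(-1, 1, 1, 1)

guidance_scale_real = 6.0
latents = torch.randn(1, 16, 64, 64)                # dummy latents
prompt_embeds = torch.randn(1, 512, 4096)            # dummy conditional text embeddings
negative_prompt_embeds = torch.randn(1, 512, 4096)   # dummy unconditional text embeddings

# Batched CFG (do_batch_cfg=True): one forward pass over a doubled batch,
# negative embeddings first, then split the prediction back apart.
embeds_input = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
latent_model_input = torch.cat([latents] * 2)
noise_pred = toy_denoiser(latent_model_input, embeds_input)
noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
batched = noise_pred_uncond + guidance_scale_real * (noise_pred_cond - noise_pred_uncond)

# Sequential CFG (do_batch_cfg=False): two forward passes at half the batch size,
# which reduces peak memory but runs the model twice per step.
cond = toy_denoiser(latents, prompt_embeds)
uncond = toy_denoiser(latents, negative_prompt_embeds)
sequential = uncond + guidance_scale_real * (cond - uncond)

# Both paths produce the same guided prediction.
assert torch.allclose(batched, sequential, atol=1e-5)
```

In the actual pipeline the same concatenation also covers the pooled projections and the prompt attention masks, as the hunk around `latent_model_input = torch.cat([latents] * 2)` shows.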