lora unloading initial
- lora_loading.py +151 -43
- modules/conditioner.py +2 -1
- modules/flux_model.py +61 -5
- util.py +5 -1
lora_loading.py
CHANGED
@@ -1,7 +1,10 @@
+import re
+from typing import Optional, OrderedDict, Tuple, TypeAlias, Union
 import torch
 from loguru import logger
 from safetensors.torch import load_file
 from tqdm import tqdm
+from torch import nn

 try:
     from cublas_ops import CublasLinear
@@ -10,6 +13,24 @@ except Exception as e:
 from float8_quantize import F8Linear
 from modules.flux_model import Flux

+path_regex = re.compile(r"\/|\\")
+
+StateDict: TypeAlias = OrderedDict[str, torch.Tensor]
+
+
+class LoraWeights:
+    def __init__(
+        self,
+        weights: StateDict,
+        path: str,
+        name: str = None,
+        scale: float = 1.0,
+    ) -> None:
+        self.path = path
+        self.weights = weights
+        self.name = name if name else path_regex.split(path)[-1]
+        self.scale = scale
+

 def swap_scale_shift(weight):
     scale, shift = weight.chunk(2, dim=0)
@@ -345,52 +366,74 @@ def get_lora_for_key(key: str, lora_weights: dict):
     return lora_A, lora_B, alpha


-@torch.inference_mode()
-def apply_lora_weight_to_module(
-    module_weight: torch.Tensor,
-    lora_weights: dict,
-    rank: int = None,
+def calculate_lora_weight(
+    lora_weights: Tuple[torch.Tensor, torch.Tensor, Union[torch.Tensor, float]],
+    rank: Optional[int] = None,
     lora_scale: float = 1.0,
+    device: Optional[Union[torch.device, int, str]] = None,
 ):
     lora_A, lora_B, alpha = lora_weights
+    if device is None:
+        device = lora_A.device

     uneven_rank = lora_B.shape[1] != lora_A.shape[0]
     rank_diff = lora_A.shape[0] / lora_B.shape[1]

     if rank is None:
         rank = lora_B.shape[1]
-    else:
-        rank = rank
     if alpha is None:
         alpha = rank
-    else:
-        alpha = alpha
-    w_dtype = module_weight.dtype
+
     dtype = torch.float32
-    device = module_weight.device
-    w_orig = module_weight.to(dtype=dtype, device=device)
     w_up = lora_A.to(dtype=dtype, device=device)
     w_down = lora_B.to(dtype=dtype, device=device)

-    # if not from_original_flux:
     if alpha != rank:
-        w_up = w_up * alpha / rank
+        w_up = w_up * (alpha / rank)
+
     if uneven_rank:
         fused_lora = lora_scale * torch.mm(
             w_down.repeat_interleave(int(rank_diff), dim=1), w_up
         )
     else:
         fused_lora = lora_scale * torch.mm(w_down, w_up)
-    fused_weight = w_orig + fused_lora
-    return fused_weight.to(dtype=w_dtype, device=device)
+    return fused_lora


 @torch.inference_mode()
-def apply_lora_to_model(model: Flux, lora_path: str, lora_scale: float = 1.0) -> Flux:
-    has_guidance = model.params.guidance_embed
-    logger.info(f"Loading LoRA weights for {lora_path}")
-    lora_weights = load_file(lora_path, "cpu")
-
+def unfuse_lora_weight_from_module(
+    fused_weight: torch.Tensor,
+    lora_weights: dict,
+    rank: Optional[int] = None,
+    lora_scale: float = 1.0,
+):
+    w_dtype = fused_weight.dtype
+    dtype = torch.float32
+    device = fused_weight.device
+
+    fused_weight = fused_weight.to(dtype=dtype, device=device)
+    fused_lora = calculate_lora_weight(lora_weights, rank, lora_scale, device=device)
+    module_weight = fused_weight - fused_lora
+    return module_weight.to(dtype=w_dtype, device=device)
+
+
+@torch.inference_mode()
+def apply_lora_weight_to_module(
+    module_weight: torch.Tensor,
+    lora_weights: dict,
+    rank: int = None,
+    lora_scale: float = 1.0,
+):
+    w_dtype = module_weight.dtype
+    dtype = torch.float32
+    device = module_weight.device
+
+    fused_lora = calculate_lora_weight(lora_weights, rank, lora_scale, device=device)
+    fused_weight = module_weight.to(dtype=dtype) + fused_lora
+    return fused_weight.to(dtype=w_dtype, device=device)
+
+
+def resolve_lora_state_dict(lora_weights, has_guidance: bool = True):
     check_if_starts_with_transformer = [
         k for k in lora_weights.keys() if k.startswith("transformer.")
     ]
@@ -399,43 +442,108 @@ def apply_lora_to_model(model: Flux, lora_path: str, lora_scale: float = 1.0) ->
             lora_weights, 19, 38, has_guidance=has_guidance, prefix="transformer."
         )
     else:
-        from_original_flux = True
         lora_weights = convert_from_original_flux_checkpoint(lora_weights)
     logger.info("LoRA weights loaded")
     logger.debug("Extracting keys")
     keys_without_ab = [
         key.replace(".lora_A.weight", "")
         .replace(".lora_B.weight", "")
+        .replace(".lora_A", "")
+        .replace(".lora_B", "")
         .replace(".alpha", "")
         for key in lora_weights.keys()
     ]
     logger.debug("Keys extracted")
     keys_without_ab = list(set(keys_without_ab))
+    keys_without_ab = list(
+        set(
+            [
+                key.replace(".lora_A.weight", "")
+                .replace(".lora_B.weight", "")
+                .replace(".lora_A", "")
+                .replace(".lora_B", "")
+                .replace(".alpha", "")
+                for key in keys_without_ab
+            ]
+        )
+    )
+    return keys_without_ab, lora_weights
+
+
+def get_lora_weights(lora_path: str | StateDict):
+    if isinstance(lora_path, dict):
+        return lora_path, True
+    else:
+        return load_file(lora_path, "cpu"), False
+
+
+def extract_weight_from_linear(linear: Union[nn.Linear, CublasLinear, F8Linear]):
+    dtype = linear.weight.dtype
+    weight_is_f8 = False
+    if isinstance(linear, F8Linear):
+        weight_is_f8 = True
+        weight = (
+            linear.float8_data.clone()
+            .detach()
+            .float()
+            .mul(linear.scale_reciprocal)
+            .to(linear.weight.device)
+        )
+    elif isinstance(linear, torch.nn.Linear):
+        weight = linear.weight.clone().detach().float()
+    elif isinstance(linear, CublasLinear):
+        weight = linear.weight.clone().detach().float()
+    return weight, weight_is_f8, dtype
+
+
+@torch.inference_mode()
+def apply_lora_to_model(
+    model: Flux,
+    lora_path: str | StateDict,
+    lora_scale: float = 1.0,
+    return_lora_resolved: bool = False,
+) -> Flux:
+    has_guidance = model.params.guidance_embed
+    logger.info(f"Loading LoRA weights for {lora_path}")
+    lora_weights, _ = get_lora_weights(lora_path)
+
+    keys_without_ab, lora_weights = resolve_lora_state_dict(lora_weights, has_guidance)

     for key in tqdm(keys_without_ab, desc="Applying LoRA", total=len(keys_without_ab)):
         module = get_module_for_key(key, model)
-        dtype = module.weight.dtype
-        weight_is_f8 = False
-        if isinstance(module, F8Linear):
-            weight_is_f8 = True
-            weight_f16 = (
-                module.float8_data.clone()
-                .detach()
-                .float()
-                .mul(module.scale_reciprocal)
-                .to(module.weight.device)
-            )
-        elif isinstance(module, torch.nn.Linear):
-            weight_f16 = module.weight.clone().detach().float()
-        elif isinstance(module, CublasLinear):
-            weight_f16 = module.weight.clone().detach().float()
+        weight, is_f8, dtype = extract_weight_from_linear(module)
         lora_sd = get_lora_for_key(key, lora_weights)
-        weight_f16 = apply_lora_weight_to_module(
-            weight_f16, lora_sd, lora_scale=lora_scale
-        )
-        if weight_is_f8:
-            module.set_weight_tensor(weight_f16.type(dtype))
+        weight = apply_lora_weight_to_module(weight, lora_sd, lora_scale=lora_scale)
+        if is_f8:
+            module.set_weight_tensor(weight.type(dtype))
         else:
-            module.weight.data = weight_f16.type(dtype)
+            module.weight.data = weight.type(dtype)
     logger.success("Lora applied")
+    if return_lora_resolved:
+        return model, lora_weights
+    return model
+
+
+def remove_lora_from_module(
+    model: Flux,
+    lora_path: str | StateDict,
+    lora_scale: float = 1.0,
+):
+    has_guidance = model.params.guidance_embed
+    logger.info(f"Loading LoRA weights for {lora_path}")
+    lora_weights = get_lora_weights(lora_path)
+    lora_weights, _ = get_lora_weights(lora_path)
+
+    keys_without_ab, lora_weights = resolve_lora_state_dict(lora_weights, has_guidance)
+
+    for key in tqdm(keys_without_ab, desc="Unfusing LoRA", total=len(keys_without_ab)):
+        module = get_module_for_key(key, model)
+        weight, is_f8, dtype = extract_weight_from_linear(module)
+        lora_sd = get_lora_for_key(key, lora_weights)
+        weight = unfuse_lora_weight_from_module(weight, lora_sd, lora_scale=lora_scale)
+        if is_f8:
+            module.set_weight_tensor(weight.type(dtype))
+        else:
+            module.weight.data = weight.type(dtype)
+    logger.success("Lora unfused")
     return model
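
For orientation, a minimal usage sketch of the fuse/unfuse pair defined in this file. It assumes a Flux model has already been constructed (for example via util.load_flow_model) and uses a placeholder LoRA path:

from lora_loading import apply_lora_to_model, remove_lora_from_module

# model: an existing Flux instance (e.g. from util.load_flow_model)
lora_file = "loras/my_lora.safetensors"  # placeholder path

# fuse: each targeted linear weight becomes W + lora_scale * (lora_B @ lora_A),
# with the usual alpha/rank scaling, computed in fp32 and cast back to the weight dtype
model = apply_lora_to_model(model, lora_file, lora_scale=0.8)

# ... run inference with the LoRA fused ...

# unfuse: the same scaled product is subtracted again, restoring the original weights
model = remove_lora_from_module(model, lora_file, lora_scale=0.8)

Because remove_lora_from_module re-resolves the same state dict and routes it through unfuse_lora_weight_from_module, passing the identical path and scale used when fusing recovers the pre-LoRA weights up to floating-point rounding.
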
modules/conditioner.py
CHANGED
@@ -43,6 +43,7 @@ class HFEmbedder(nn.Module):
         device: torch.device | int,
         quantization_dtype: str | None = None,
         offloading_device: torch.device | int | None = torch.device("cpu"),
+        is_clip: bool = False,
         **hf_kwargs,
     ):
         super().__init__()
@@ -54,7 +55,7 @@ class HFEmbedder(nn.Module):
         self.device = (
             device if isinstance(device, torch.device) else torch.device(device)
         )
-        self.is_clip = version.startswith("openai")
+        self.is_clip = version.startswith("openai") or is_clip
         self.max_length = max_length
         self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"

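
To illustrate the new flag: previously a checkpoint was treated as CLIP only when its repo id started with "openai"; is_clip lets a custom CLIP checkpoint opt in to the pooled output as well. A hedged sketch, with a hypothetical checkpoint id and keyword arguments mirroring the call site in util.py:

import torch
from modules.conditioner import HFEmbedder

clip = HFEmbedder(
    "some-org/custom-clip-vit-l-14",  # hypothetical checkpoint that does not start with "openai"
    max_length=77,
    torch_dtype=torch.float16,
    device=0,
    is_clip=True,  # forces self.is_clip, so output_key becomes "pooler_output"
)
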
modules/flux_model.py
CHANGED
@@ -1,11 +1,13 @@
-from collections import namedtuple
 import os
-from typing import TYPE_CHECKING
+from collections import namedtuple
+from typing import TYPE_CHECKING, List
+
 import torch
+from loguru import logger

 if TYPE_CHECKING:
+    from lora_loading import LoraWeights
     from util import ModelSpec
-
 DISABLE_COMPILE = os.getenv("DISABLE_COMPILE", "0") == "1"
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
@@ -14,8 +16,8 @@ torch.backends.cudnn.benchmark_limit = 20
 torch.set_float32_matmul_precision("high")
 import math

-from torch import Tensor, nn
 from pydantic import BaseModel
+from torch import Tensor, nn
 from torch.nn import functional as F


@@ -345,6 +347,7 @@ class DoubleStreamBlock(nn.Module):
         self.H = self.num_heads
         self.KH = self.K * self.H
         self.do_clamp = dtype == torch.float16
+
     def rearrange_for_norm(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
         B, L, D = x.shape
         q, k, v = x.reshape(B, L, self.K, self.H, D // self.KH).permute(2, 0, 3, 1, 4)
@@ -512,6 +515,7 @@ class Flux(nn.Module):
         self.params = config.params
         self.in_channels = config.params.in_channels
         self.out_channels = self.in_channels
+        self.loras: List[LoraWeights] = []
         prequantized_flow = config.prequantized_flow
         quantized_embedders = config.quantize_flow_embedder_layers and prequantized_flow
         quantized_modulation = config.quantize_modulation and prequantized_flow
@@ -614,6 +618,57 @@ class Flux(nn.Module):

         self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)

+    def get_lora(self, identifier: str):
+        for lora in self.loras:
+            if lora.path == identifier or lora.name == identifier:
+                return lora
+
+    def has_lora(self, identifier: str):
+        for lora in self.loras:
+            if lora.path == identifier or lora.name == identifier:
+                return True
+
+    def load_lora(self, path: str, scale: float, name: str = None):
+        from lora_loading import (
+            LoraWeights,
+            apply_lora_to_model,
+            remove_lora_from_module,
+        )
+
+        if self.has_lora(path):
+            lora = self.get_lora(path)
+            if lora.scale == scale:
+                logger.warning(
+                    f"Lora {lora.name} already loaded with same scale - ignoring!"
+                )
+            else:
+                remove_lora_from_module(self, lora, lora.scale)
+                apply_lora_to_model(self, lora, scale)
+                for idx, lora_ in enumerate(self.loras):
+                    if lora_.path == lora.path:
+                        self.loras[idx].scale = scale
+                        break
+        else:
+            _, lora = apply_lora_to_model(self, path, scale, return_lora_resolved=True)
+            self.loras.append(LoraWeights(lora, path, name, scale))
+
+    def unload_lora(self, path_or_identifier: str):
+        from lora_loading import remove_lora_from_module
+
+        removed = False
+        for idx, lora_ in enumerate(list(self.loras)):
+            if lora_.path == path_or_identifier or lora_.name == path_or_identifier:
+                remove_lora_from_module(self, lora_.weights, lora_.scale)
+                self.loras.pop(idx)
+                removed = True
+                break
+        if not removed:
+            logger.warning(
+                f"Couldn't remove lora {path_or_identifier} as it wasn't found fused to the model!"
+            )
+        else:
+            logger.info("Successfully removed lora from module.")
+
     def forward(
         self,
         img: Tensor,
@@ -664,9 +719,10 @@ class Flux(nn.Module):
     def from_pretrained(
         cls: "Flux", path: str, dtype: torch.dtype = torch.float16
     ) -> "Flux":
-        from util import load_config_from_path
         from safetensors.torch import load_file

+        from util import load_config_from_path
+
         config = load_config_from_path(path)
         with torch.device("meta"):
             klass = cls(config=config, dtype=dtype)
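
A short sketch of the model-level API added above, with a placeholder LoRA path. load_lora fuses the weights and records a LoraWeights entry so the same LoRA can later be unfused by path or by name:

# model is an existing Flux instance
model.load_lora("loras/my_lora.safetensors", scale=0.8, name="my_lora")

# loading the same path again with a different scale unfuses and re-fuses at the new
# scale; an identical scale is skipped with a warning
model.load_lora("loras/my_lora.safetensors", scale=0.5)

# unload by the recorded name (or by the original path)
model.unload_lora("my_lora")
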
util.py
CHANGED
@@ -34,11 +34,14 @@ class QuantizationDtype(StrEnum):
     bfloat16 = "bfloat16"
     float16 = "float16"

+
 class ModelSpec(BaseModel):
     version: ModelVersion
     params: FluxParams
     ae_params: AutoEncoderParams
     ckpt_path: str | None
+    # Add option to pass in custom clip model
+    clip_path: str | None = "openai/clip-vit-large-patch14"
     ae_path: str | None
     repo_id: str | None
     repo_flow: str | None
@@ -255,10 +258,11 @@ def load_flow_model(config: ModelSpec) -> Flux:

 def load_text_encoders(config: ModelSpec) -> tuple[HFEmbedder, HFEmbedder]:
     clip = HFEmbedder(
-        "openai/clip-vit-large-patch14",
+        config.clip_path,
         max_length=77,
         torch_dtype=into_dtype(config.text_enc_dtype),
         device=into_device(config.text_enc_device).index or 0,
+        is_clip=True,
         quantization_dtype=config.clip_quantization_dtype,
     )
     t5 = HFEmbedder(