from typing import NamedTuple
import torch
import torch.nn as nn


class LoRALinear(nn.Module):
    """
    Implementation of:
        - LoRA: https://arxiv.org/abs/2106.09685

    Notes:
        - Freezing is handled at network level, not layer level.
        - The scaling factor controls the relative importance of the LoRA skip
          connection versus the original frozen weight. General guidance is to
          keep it at 2.0 and sweep over the learning rate when changing the rank.
    """
    def __init__(
        self,
        in_features: int,
        out_features: int,
        rank: int,
        scaling: float,
        dropout: float,
        bias: bool = False,
    ):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        assert not bias
        self.bias = bias
        self.rank = rank
        self.scaling = scaling

        self.dropout = nn.Dropout(p=dropout)

        # Low-rank adapter: lora_A projects down to `rank`, lora_B projects back up.
        self.lora_A = nn.Linear(
            self.in_features,
            self.rank,
            bias=self.bias,
        )
        self.lora_B = nn.Linear(
            self.rank,
            self.out_features,
            bias=self.bias,
        )

        # Original dense weight; kept frozen by the surrounding network.
        self.frozen_W = nn.Linear(self.in_features, self.out_features, bias=self.bias)

        # make sure no LoRA weights are marked as "missing" in load_state_dict
        def ignore_missing_keys(m: nn.Module, incompatible_keys: NamedTuple):
            # empty missing keys in place
            incompatible_keys.missing_keys[:] = []  # type: ignore

        self.register_load_state_dict_post_hook(ignore_missing_keys)
    def merge_weight(self):
        # Fold the low-rank update into a single dense weight:
        # merged = frozen_W + scaling * (B @ A)
        with torch.no_grad():
            down_weight = self.lora_A.weight
            up_weight = self.lora_B.weight

            weight = up_weight.mm(down_weight) * self.scaling

            weight += self.frozen_W.weight
        return weight
    def _load_from_state_dict(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        key_name = prefix + "weight"

        # full checkpoint
        if key_name in state_dict:
            w_ref = state_dict[key_name]

            # load frozen weights
            self.frozen_W.load_state_dict({"weight": w_ref}, assign=True)
    def forward(self, x: torch.Tensor):
        # y = frozen_W(x) + scaling * B(A(dropout(x)))
        lora = self.lora_B(self.lora_A(self.dropout(x)))
        return self.frozen_W(x) + lora * self.scaling
    def __repr__(self) -> str:
        return "{}Linear(in_features={}, out_features={}, r={}, dropout={})".format(
            "LoRA", self.in_features, self.out_features, self.rank, self.dropout.p
        )
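

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how this layer is meant to be used, assuming the
# surrounding training code freezes everything except the LoRA adapters
# (as the docstring notes, freezing happens at network level). The
# dimensions and hyperparameters below are hypothetical and for
# illustration only.
if __name__ == "__main__":
    layer = LoRALinear(
        in_features=16,
        out_features=32,
        rank=4,
        scaling=2.0,
        dropout=0.0,
    )

    # Network-level freezing: keep only the LoRA adapters trainable.
    for name, param in layer.named_parameters():
        param.requires_grad = "lora_A" in name or "lora_B" in name

    x = torch.randn(2, 16)
    y = layer(x)  # frozen path plus scaled low-rank update
    print(y.shape)  # torch.Size([2, 32])

    # Fold the adapters back into a single dense weight for inference.
    merged = layer.merge_weight()
    print(merged.shape)  # torch.Size([32, 16])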