[Fix bug] TypeError: argument of type 'XLMRobertaFlashConfig' is not iterable
#55 by phuonglk - opened

modeling_lora.py (CHANGED, +15 -13)
@@ -11,16 +11,12 @@ from torch.nn import Parameter
 from torch.nn import functional as F
 from transformers import PretrainedConfig
 
-from .rotary import RotaryEmbedding
-from .mlp import FusedMLP, Mlp
-from .xlm_padding import index_first_axis_residual, pad_input, unpad_input
-from .stochastic_depth import stochastic_depth
-from .mha import MHA
-from .block import Block
 from .configuration_xlm_roberta import XLMRobertaFlashConfig
-from .
-
-
+from .modeling_xlm_roberta import (
+    XLMRobertaFlashConfig,
+    XLMRobertaModel,
+    XLMRobertaPreTrainedModel,
+)
 
 
 def initialized_weights(
@@ -336,7 +332,7 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
         **kwargs,
     ):
         for key in list(kwargs.keys()):
-            if key in config:
+            if key in config.to_dict():
                 config.update({key: kwargs.pop(key)})
         if config.load_trained_adapters:  # checkpoint already contains LoRA adapters
             return super().from_pretrained(
@@ -350,11 +346,14 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
                 token=token,
                 revision=revision,
                 use_safetensors=use_safetensors,
-                **kwargs
+                **kwargs,
             )
         else:  # initializing new adapters
             roberta = XLMRobertaModel.from_pretrained(
-                pretrained_model_name_or_path,
+                pretrained_model_name_or_path,
+                *model_args,
+                use_flash_attn=config.use_flash_attn,
+                **kwargs,
             )
             return cls(config, roberta=roberta)
 
@@ -418,7 +417,10 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
         if isinstance(sentences, str):
             sentences = self._task_instructions[task] + sentences
         else:
-            sentences = [
+            sentences = [
+                self._task_instructions[task] + sentence for sentence in sentences
+            ]
         return self.roberta.encode(
             sentences, *args, adapter_mask=adapter_mask, **kwargs
         )
+
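For anyone wondering about the title error: `PretrainedConfig` subclasses such as `XLMRobertaFlashConfig` define neither `__contains__` nor `__iter__`, so the old `if key in config:` check raises exactly that `TypeError`; testing membership against `config.to_dict()` avoids it. A minimal sketch of the failure and of the patched check, using a stand-in config class rather than the repo's actual module:

```python
from transformers import PretrainedConfig


class DemoFlashConfig(PretrainedConfig):
    """Stand-in for XLMRobertaFlashConfig: extra kwargs become plain attributes."""

    model_type = "demo-xlm-roberta"


config = DemoFlashConfig(use_flash_attn=False)

try:
    _ = "use_flash_attn" in config  # PretrainedConfig has no __contains__ / __iter__
except TypeError as err:
    print(err)  # argument of type 'DemoFlashConfig' is not iterable

# The patched check tests membership against the serialized dict instead,
# then moves matching kwargs into the config:
kwargs = {"use_flash_attn": True, "unrelated_option": 123}
for key in list(kwargs.keys()):
    if key in config.to_dict():
        config.update({key: kwargs.pop(key)})

print(config.use_flash_attn)  # True
print(kwargs)                 # {'unrelated_option': 123}
```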
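The `from_pretrained` changes also stop dropping arguments when new adapters are initialized: `*model_args` and the remaining `**kwargs` are forwarded to the backbone, and `use_flash_attn` is taken from the (possibly just updated) config. A usage sketch under the assumption that the checkpoint is loaded through `AutoModel` with remote code enabled; the repository id below is a placeholder:

```python
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "my-org/my-xlm-roberta-lora-checkpoint",  # placeholder repo id, not a real checkpoint
    trust_remote_code=True,  # modeling_lora.py is custom code shipped with the repo
    use_flash_attn=False,    # with the fix, matched via `key in config.to_dict()` and
)                            # moved into the config instead of raising a TypeError
```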
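On the `encode` side, the task instruction is now prepended to every element when a list of sentences is passed, matching what already happened for a single string. A small sketch of the patched branch, with a made-up task name and instruction text standing in for `self._task_instructions`:

```python
# Stand-ins for self._task_instructions and the caller's inputs.
task_instructions = {"retrieval.query": "Represent the query for retrieval: "}
task = "retrieval.query"
sentences = ["how do plants produce oxygen?", "what is a LoRA adapter?"]

if isinstance(sentences, str):
    sentences = task_instructions[task] + sentences
else:
    # Patched branch: prefix each sentence before delegating to roberta.encode(...)
    sentences = [task_instructions[task] + sentence for sentence in sentences]

print(sentences[0])
# Represent the query for retrieval: how do plants produce oxygen?
```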