File size: 6,414 Bytes
2646361 13c4251 2646361 2e3ebcb 95b4916 09dbf45 2e3ebcb 2646361 09dbf45 2646361 2e3ebcb 2646361 09dbf45 2646361 2e3ebcb 2646361 f9b3adb 2e3ebcb 4434bf3 2e3ebcb 09dbf45 9db6c6f f9b3adb 4b000ec 6a92924 4b000ec 9db6c6f 13c4251 8542ad8 2646361 13c4251 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
from typing import Any, Dict, List, Optional, Union
import torch
from transformers import PretrainedConfig
class XLMRobertaFlashConfig(PretrainedConfig):
    """Configuration container for the XLM-RoBERTa "flash" model variant.

    Stores the encoder hyper-parameters together with the LoRA,
    FlashAttention, pooling and embedding-truncation options accepted by
    ``__init__``. Every argument is stored on the instance under the same
    name; ``torch_dtype`` is additionally normalised (see ``__init__``).
    """

    model_type = "xlm-roberta"

    def __init__(
        self,
        vocab_size: int = 250002,
        hidden_size: int = 1024,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 16,
        intermediate_size: int = 4096,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        max_position_embeddings: int = 8194,
        type_vocab_size: int = 1,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-05,
        pad_token_id: int = 1,
        bos_token_id: int = 0,
        eos_token_id: int = 2,
        position_embedding_type: str = "rotary",
        rotary_emb_base: float = 10000.0,
        use_cache: bool = True,
        use_reentrant: bool = False,
        classifier_dropout: Optional[float] = None,
        lora_adaptations: Optional[List[str]] = None,
        lora_prompts: Optional[Dict[str, str]] = None,
        lora_rank: int = 4,
        lora_dropout_p: float = 0.0,
        lora_alpha: int = 1,
        lora_main_params_trainable: bool = False,
        load_trained_adapters: bool = False,
        use_flash_attn: bool = True,
        torch_dtype: Optional[Union[str, torch.dtype]] = None,
        emb_pooler: Optional[str] = None,
        matryoshka_dimensions: Optional[List[int]] = None,
        truncate_dim: Optional[int] = None,
        **kwargs: Any,
    ):
        """
        Initialize the XLMRobertaFlashConfig configuration.

        Args:
            vocab_size (int): Size of the vocabulary.
            hidden_size (int): Dimensionality of the encoder layers and the
                pooler layer.
            num_hidden_layers (int): Number of hidden layers in the
                Transformer encoder.
            num_attention_heads (int): Number of attention heads for each
                attention layer in the Transformer encoder.
            intermediate_size (int): Dimensionality of the "intermediate"
                (i.e., feed-forward) layer in the Transformer.
            hidden_act (str): The activation function to use.
            hidden_dropout_prob (float): The dropout probability for all
                fully connected layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob (float): The dropout ratio for the
                attention probabilities.
            max_position_embeddings (int): The maximum length of the position
                embeddings.
            type_vocab_size (int): The vocabulary size of the token type ids.
            initializer_range (float): The standard deviation for
                initializing all weight matrices.
            layer_norm_eps (float): The epsilon used by the layer
                normalization layers.
            pad_token_id (int): The ID of the padding token.
            bos_token_id (int): The ID of the beginning-of-sequence token.
            eos_token_id (int): The ID of the end-of-sequence token.
            position_embedding_type (str): Type of position embeddings.
                Options are 'absolute', 'alibi', or 'rotary'.
            rotary_emb_base (float): Base for rotary embeddings.
            use_cache (bool): Whether or not the model should return the last
                key/values attentions (not used by all models).
            use_reentrant (bool): Whether or not the model should enable the
                'use_reentrant' flag in gradient checkpointing.
            classifier_dropout (Optional[float]): The dropout ratio for the
                classification head.
            lora_adaptations (Optional[List[str]]): LoRA adaptations
                configuration.
            lora_prompts (Optional[Dict[str, str]]): LoRA prompts
                configuration.
            lora_rank (int): Rank for LoRA adaptations.
            lora_dropout_p (float): Dropout probability for LoRA adaptations.
            lora_alpha (int): Alpha parameter for LoRA.
            lora_main_params_trainable (bool): Whether to make the main model
                parameters trainable when using LoRA.
            load_trained_adapters (bool): Whether to load trained adapters.
            use_flash_attn (bool): Whether to use FlashAttention.
            torch_dtype (Optional[Union[str, torch.dtype]]): Data type for
                the tensors. A string that names a ``torch.dtype`` attribute
                of the ``torch`` module (e.g. ``"float16"``) is converted to
                that dtype; any other value (a ``torch.dtype``, ``None``, or
                an unrecognised string) is stored unchanged.
            emb_pooler (Optional[str]): Pooling layer configuration.
            matryoshka_dimensions (Optional[List[int]]): Configuration for
                matryoshka dimension reduction.
            truncate_dim (Optional[int]): Dimension to truncate embeddings
                to, if any.
            **kwargs (Any): Additional keyword arguments passed to the
                configuration.
        """
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        # Core encoder hyper-parameters.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.rotary_emb_base = rotary_emb_base
        self.use_cache = use_cache
        self.use_reentrant = use_reentrant
        self.classifier_dropout = classifier_dropout

        # LoRA / adapter options.
        self.load_trained_adapters = load_trained_adapters
        self.lora_adaptations = lora_adaptations
        self.lora_prompts = lora_prompts
        self.lora_rank = lora_rank
        self.lora_dropout_p = lora_dropout_p
        self.lora_alpha = lora_alpha
        self.lora_main_params_trainable = lora_main_params_trainable

        # Attention backend and embedding post-processing options.
        self.use_flash_attn = use_flash_attn
        self.emb_pooler = emb_pooler
        self.matryoshka_dimensions = matryoshka_dimensions
        self.truncate_dim = truncate_dim

        # Only strings can be looked up as attribute names on ``torch``;
        # calling hasattr()/getattr() with an actual ``torch.dtype`` raises
        # TypeError, so non-string values skip the lookup and are kept as-is.
        resolved_dtype = (
            getattr(torch, torch_dtype, None)
            if isinstance(torch_dtype, str)
            else None
        )
        if isinstance(resolved_dtype, torch.dtype):
            self.torch_dtype = resolved_dtype
        else:
            # None, an already-resolved dtype, or a string that does not name
            # a torch dtype (stored verbatim, exactly as before).
            self.torch_dtype = torch_dtype
|