File size: 8,870 Bytes
5df4390 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
# Copyright 2022 MosaicML Examples authors
# SPDX-License-Identifier: Apache-2.0
"""A HuggingFace-style model configuration."""
from typing import Optional, Tuple, Union
from transformers import PretrainedConfig
class MosaicGPTConfig(PretrainedConfig):
    """HuggingFace-style configuration holding the MosaicGPT hyperparameters.

    Every constructor argument is stored as an attribute of the same name, the
    remaining keyword arguments are handed to ``PretrainedConfig.__init__``,
    and the resulting combination is checked by ``_validate_config``.
    """

    model_type = 'mosaic_gpt'

    def __init__(
        self,
        d_model: int = 2048,
        n_heads: int = 16,
        n_layers: int = 24,
        mlp_ratio: int = 4,
        max_seq_len: int = 2048,
        vocab_size: int = 50368,
        attn_pdrop: float = 0.0,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        attn_impl: str = 'triton',
        attn_qk_ln: bool = False,
        attn_clip_qkv: Optional[float] = None,
        softmax_scale: Optional[float] = None,
        prefix_lm: Optional[bool] = False,
        attn_uses_sequence_id: Optional[bool] = False,
        alibi: bool = False,
        alibi_bias_max: int = 8,
        init_device: str = 'cpu',
        logit_scale: Optional[Union[float, str]] = None,
        no_bias: bool = False,
        verbose: int = 0,
        param_init_fn: str = 'kaiming_normal_',
        init_div_is_residual: Union[int, float, str, bool] = True,
        init_std: float = 0.02,
        emb_init_std: Optional[float] = None,
        emb_init_uniform_lim: Optional[Union[Tuple[float, float],
                                             float]] = None,
        init_gain: float = 0,
        fan_mode: str = 'fan_in',
        init_nonlinearity: str = 'relu',
        embedding_fraction: float = 1.0,
        low_precision_layernorm: bool = True,
        use_cache: bool = False,
        **kwargs,
    ):
        """The MosaicGPT configuration class.

        Args:
            d_model (int): The size of the embedding dimension of the model.
                Must be divisible by ``n_heads``.
            n_heads (int): The number of attention heads.
            n_layers (int): The number of layers in the model.
            mlp_ratio (int): The ratio of the up/down scale in the MLP.
            max_seq_len (int): The maximum sequence length of the model.
            vocab_size (int): The size of the vocabulary.
            attn_pdrop (float): The dropout probability for the attention layers.
            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
            emb_pdrop (float): The dropout probability for the embedding layer.
            attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
            attn_qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
            attn_clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
                this value.
            softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
                use the default scale of ``1/sqrt(d_keys)``.
            prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
                extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
                can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
                Only accepted together with the 'torch' or 'triton' ``attn_impl``.
            attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
                When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
                which sub-sequence each token belongs to.
                Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
                Only accepted together with the 'torch' or 'triton' ``attn_impl``.
            alibi (bool): Whether to use the alibi bias instead of position embeddings.
                Only accepted together with the 'torch' or 'triton' ``attn_impl``.
            alibi_bias_max (int): The maximum value of the alibi bias.
            init_device (str): The device to use for parameter initialization.
            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. The only
                string accepted by validation is ``'inv_sqrt_d_model'``.
            no_bias (bool): Toggles bias terms in all layers. NOTE(review): the flag name suggests ``True``
                removes the biases; the flag is consumed outside this class, so confirm there.
            verbose (int): The verbosity level. 0 is silent.
            param_init_fn (str): The parameter initialization scheme to use. One of 'default_', 'baseline_', 'kaiming_uniform_',
                'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or 'xavier_normal_'.
            init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
            init_std (float): The standard deviation of the normal distribution used to initialize the model,
                if using the baseline_ parameter initialization scheme.
            emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
            emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
                used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
            init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
            fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
            init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
                Must lie in the interval (0, 1].
            low_precision_layernorm (bool): Whether to use low precision layer normalization.
            use_cache (bool): Whether or not the model should return the last key/values attentions.
            **kwargs: Additional keyword arguments forwarded to ``PretrainedConfig.__init__``. The keys
                ``name`` and ``loss_fn`` are discarded before forwarding.

        Raises:
            ValueError: If a value fails one of the range/choice checks in ``_validate_config``.
            NotImplementedError: If a feature is requested with an ``attn_impl`` that validation rejects for it.
        """
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.mlp_ratio = mlp_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.attn_pdrop = attn_pdrop
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.attn_impl = attn_impl
        self.attn_qk_ln = attn_qk_ln
        self.attn_clip_qkv = attn_clip_qkv
        self.softmax_scale = softmax_scale
        self.prefix_lm = prefix_lm
        self.attn_uses_sequence_id = attn_uses_sequence_id
        self.alibi = alibi
        self.alibi_bias_max = alibi_bias_max
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.verbose = verbose
        self.param_init_fn = param_init_fn
        self.init_div_is_residual = init_div_is_residual
        self.init_std = init_std
        self.emb_init_std = emb_init_std
        self.emb_init_uniform_lim = emb_init_uniform_lim
        self.init_gain = init_gain
        self.fan_mode = fan_mode
        self.init_nonlinearity = init_nonlinearity
        self.embedding_fraction = embedding_fraction
        self.low_precision_layernorm = low_precision_layernorm
        self.use_cache = use_cache

        # These two keys are dropped so they are never forwarded to the
        # base-class constructor.
        kwargs.pop('name', None)
        kwargs.pop('loss_fn', None)

        super().__init__(**kwargs)

        self._validate_config()

    def _validate_config(self):
        """Check that the stored hyperparameters form a supported combination.

        Raises:
            ValueError: If ``d_model`` is not divisible by ``n_heads``; if any
                of ``attn_pdrop``, ``resid_pdrop``, ``emb_pdrop`` is not in
                [0, 1]; if ``attn_impl`` is not 'torch', 'flash' or 'triton';
                if ``embedding_fraction`` is not in (0, 1]; or if
                ``logit_scale`` is a string other than 'inv_sqrt_d_model'.
            NotImplementedError: If ``prefix_lm``, ``alibi`` or
                ``attn_uses_sequence_id`` is enabled while ``attn_impl`` is
                neither 'torch' nor 'triton'.
        """
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')

        # Phrased as a positive interval test: every comparison against NaN is
        # False, so "p < 0 or p > 1" would let a NaN probability slip through.
        dropout_probs = (self.attn_pdrop, self.resid_pdrop, self.emb_pdrop)
        if not all(0 <= prob <= 1 for prob in dropout_probs):
            raise ValueError(
                'attn_pdrop, resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1'
            )

        if self.attn_impl not in ('torch', 'flash', 'triton'):
            raise ValueError(f'Unknown attn_impl={self.attn_impl}')

        # These features are rejected for any attn_impl other than torch/triton.
        # They are checked in this fixed order so the first offending feature
        # is the one reported.
        if self.attn_impl not in ('torch', 'triton'):
            for feature in ('prefix_lm', 'alibi', 'attn_uses_sequence_id'):
                if getattr(self, feature):
                    raise NotImplementedError(
                        f'{feature} only implemented with torch and triton attention.'
                    )

        # Positive interval test for the same NaN-safety reason as above.
        if not 0 < self.embedding_fraction <= 1:
            raise ValueError(
                'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!'
            )

        if isinstance(self.logit_scale,
                      str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(
                f"{self.logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
            )
|