Spaces:
Sleeping
Sleeping
File size: 1,992 Bytes
5325fcc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# @package __global__

# Defaults list. `_self_` is listed first, so the `model_scale` group that
# follows it is composed afterwards and may override values set in this file.
defaults:
  - _self_
  # Prefer this group to set model scale instead of editing the
  # `transformer_lm` keys directly.
  - /model/lm/model_scale: base

# Presumably selects which LM implementation is built; the value matches the
# `transformer_lm` settings key defined in this file.
lm_model: transformer_lm
# Codebook pattern settings. `modeling` has to be indented under
# `codebooks_pattern`: at column 0 it would parse as a separate top-level key
# and `codebooks_pattern` itself would be null.
codebooks_pattern:
  modeling: parallel
# Settings for the `transformer_lm` model. Every key below has to be indented
# under `transformer_lm`: at column 0 each one would parse as an unrelated
# top-level key and `transformer_lm` itself would be null.
# To change model scale, prefer the `/model/lm/model_scale` defaults group over
# editing these keys directly.
transformer_lm:
  dim: 512
  num_heads: 8
  num_layers: 8
  hidden_scale: 4
  n_q: 8  # number of streams to model
  card: 1024
  dropout: 0.0
  emb_lr: null
  activation: gelu
  norm_first: false  # use pre-norm instead of post-norm
  bias_ff: true  # use bias for the feedforward
  bias_attn: true  # use bias for the attention
  bias_proj: true  # use bias for the output projections
  past_context: null
  causal: true
  custom: false  # use custom MHA implementation
  memory_efficient: false  # use flash attention
  # Use float32 for the attention part; recommended at the moment when
  # memory_efficient is true.
  attention_as_float32: false
  layer_scale: null
  # Positional embedding strategy (sin, rope, or sin_rope).
  positional_embedding: sin
  xpos: false  # apply xpos decay (rope only)
  # Layer checkpointing method: none, torch, or xformers_default.
  # torch is the slowest but uses the least memory; xformers_default is
  # somewhere in between. Note that `none` here is the plain string "none",
  # not a YAML null.
  checkpointing: none
  weight_init: null  # weight initialization (null, gaussian or uniform)
  # Perform depthwise initialization (null, current, global).
  depthwise_init: null
  # Initialize bias to zero if bias is enabled in linears and a weight_init
  # method is used.
  zero_bias_init: false
  norm: layer_norm  # normalization method to use in transformer
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1
  # Whether to do true 2 steps CFG, potentially resolving some padding issues
  # or not...
  two_step_cfg: false
|