---
# Model card metadata (YAML front matter); it must begin on the first line
# of the file.
base_model:
  - TheHierophant/Underground-Cognitive-V0.3-test
library_name: transformers
tags:
  - mergekit
  - merge
---

# merge

This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).

## Merge Details

### Merge Method

This model was merged using the passthrough merge method.

### Models Merged

The following models were included in the merge:

* [TheHierophant/Underground-Cognitive-V0.3-test](https://huggingface.co/TheHierophant/Underground-Cognitive-V0.3-test)

### Configuration

The following YAML configuration was used to produce this model:

```yaml
# Passthrough merge that stacks four layer ranges of a single source model.
#
# NOTE(review): the indentation of this block was lost in the source text.
# The nesting below is reconstructed from the usual mergekit layout
# (slices -> sources -> model / layer_range / parameters). Placing `weight`,
# `significance` and `rope_scaling` under `parameters` is an assumption;
# confirm against the original configuration file.
slices:
  - sources:
      - model: TheHierophant/Underground-Cognitive-V0.3-test
        layer_range: [0, 8]
        parameters:
          attention:
            - filter: q_proj
              value: 1.15
            - filter: k_proj
              value: 1.1
            - filter: v_proj
              value: 1.2
            - filter: down_proj
              value: 1.1
          # Add significance to reinforce precise calculations in the early
          # layers.
          significance: 0.85
          # Weight to ensure the initial layers handle information accurately.
          weight: 0.3
          rope_scaling:
            type: "linear"
            # Adjustment to improve positional capacity.
            value: 0.7
  - sources:
      - model: TheHierophant/Underground-Cognitive-V0.3-test
        layer_range: [8, 16]
        parameters:
          attention:
            - filter: q_proj
              value: 1.25
            - filter: k_proj
              value: 1.15
            - filter: v_proj
              value: 1.3
            - filter: down_proj
              value: 1.2
          weight: 0.35
          # Increase the focus on basic mathematical logic to distinguish
          # iteration from recursion.
          significance: 0.9
          rope_scaling:
            type: "linear"
            # Dynamic factor to ensure flexibility and automatic adjustment
            # in calculations.
            value: 0.85
  - sources:
      - model: TheHierophant/Underground-Cognitive-V0.3-test
        layer_range: [16, 32]
        parameters:
          attention:
            - filter: o_proj
              value: 1.5
            - filter: q_proj
              value: 1.4
            - filter: v_proj
              value: 1.35
            - filter: down_proj
              value: 1.3
          # More weight on the middle layers to reinforce intermediate
          # reasoning.
          weight: 0.4
          # Focus on transferring attention to the upper layers.
          significance: 0.85
  - sources:
      - model: TheHierophant/Underground-Cognitive-V0.3-test
        layer_range: [32, 48]
        parameters:
          attention:
            - filter: o_proj
              value: 2.0
            - filter: q_proj
              value: 1.8
            - filter: v_proj
              value: 1.7
            - filter: down_proj
              value: 1.65
          # Increased weight to strengthen the deep layers and complex
          # analysis.
          weight: 0.5
          # Focus on improving non-linear logic and reinforcing attention on
          # complex problems.
          significance: 0.95
base_model_config:
  attention_bias: false
  # Add dropout to prevent overfitting on repetitive calculations.
  attention_dropout: 0.05
  # Keep the SiLU function for a smooth, continuous activation.
  hidden_act: "silu"
  hidden_size: 4096
  initializer_range: 0.02
  intermediate_size: 14336
  max_position_embeddings: 4096
  num_attention_heads: 32
  num_hidden_layers: 48
  num_key_value_heads: 8
  pretraining_tp: 1
  # Written with an explicit mantissa dot: YAML 1.1 loaders resolve a bare
  # `1e-05` as a string rather than a float.
  rms_norm_eps: 1.0e-05
  rope_scaling:
    # Linear-type scaling to optimize adaptive capacity.
    type: "linear"
    # Factor tuned to keep scaling efficient.
    value: 1.1
  # Tuned to improve positional capacity on mathematical tasks.
  rope_theta: 12000.0
  tie_word_embeddings: false
  vocab_size: 32000
  # Enable the cache to improve efficiency during inference.
  use_cache: true
dtype: bfloat16
merge_method: passthrough
```