cognitivess committed
Commit: 97f6008
Parent(s): d7df90a
Update cognitivess_model/modeling_flax_Cognitivess.py

cognitivess_model/modeling_flax_Cognitivess.py CHANGED
@@ -1,6 +1,5 @@
 # coding=utf-8
-# Copyright
-#
+# Copyright 2023 Cognitivess and the HuggingFace Inc. team. All rights reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -14,6 +13,7 @@
 # limitations under the License.
 """Flax Cognitivess model."""
 
+from functools import partial
 from typing import Optional, Tuple
 
 import flax.linen as nn
@@ -26,22 +26,17 @@ from flax.linen.attention import dot_product_attention_weights
 from flax.traverse_util import flatten_dict, unflatten_dict
 from jax import lax
 
-from ...modeling_flax_outputs import
-
-
-    FlaxCausalLMOutput,
-    FlaxCausalLMOutputWithCrossAttentions,
-)
-from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, logging
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_Cognitivess import CognitivessConfig
 
 
 logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "CognitivessConfig"
-
-
+_CHECKPOINT_FOR_DOC = "afmck/testing-Cognitivess-tiny"
+_REAL_CHECKPOINT_FOR_DOC = "openlm-research/open_Cognitivess_3b_v2"
 
 Cognitivess_START_DOCSTRING = r"""
 
@@ -127,7 +122,27 @@ Cognitivess_INPUTS_DOCSTRING = r"""
 """
 
 
-
+def create_sinusoidal_positions(num_pos, dim):
+    inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))
+    freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")
+
+    emb = np.concatenate((freqs, freqs), axis=-1)
+    out = np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1)
+    return jnp.array(out[:, :, :num_pos])
+
+
+def rotate_half(tensor):
+    """Rotates half the hidden dims of the input."""
+    rotate_half_tensor = jnp.concatenate(
+        (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1
+    )
+    return rotate_half_tensor
+
+
+def apply_rotary_pos_emb(tensor, sin_pos, cos_pos):
+    return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos)
+
+
 class FlaxCognitivessRMSNorm(nn.Module):
     config: CognitivessConfig
     dtype: jnp.dtype = jnp.float32
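
The block added above moves the Llama-style rotary position embedding helpers to the top of the file: `create_sinusoidal_positions` precomputes a sine/cosine table, and `apply_rotary_pos_emb` mixes each vector with a half-rotated copy of itself. As a rough, self-contained sketch of how `rotate_half` and `apply_rotary_pos_emb` act on a query tensor (the shapes and angle construction below are illustrative only, not taken from this file):

    import jax.numpy as jnp

    def rotate_half(tensor):
        # Same formula as the helper in the diff: swap and negate the two halves.
        return jnp.concatenate(
            (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1
        )

    def apply_rotary_pos_emb(tensor, sin_pos, cos_pos):
        return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos)

    # Toy shapes: batch=1, seq_len=4, heads=2, head_dim=6 (illustrative values).
    batch, seq_len, num_heads, head_dim = 1, 4, 2, 6
    query = jnp.ones((batch, seq_len, num_heads, head_dim))

    # One angle per (position, feature pair), broadcast over batch and heads.
    positions = jnp.arange(seq_len)[:, None]                                  # (seq_len, 1)
    inv_freq = 1.0 / (10000 ** (jnp.arange(0, head_dim, 2) / head_dim))       # (head_dim // 2,)
    angles = jnp.concatenate((positions * inv_freq, positions * inv_freq), axis=-1)  # (seq_len, head_dim)
    sin_pos = jnp.sin(angles)[None, :, None, :]                               # (1, seq_len, 1, head_dim)
    cos_pos = jnp.cos(angles)[None, :, None, :]

    rotated_query = apply_rotary_pos_emb(query, sin_pos, cos_pos)
    print(rotated_query.shape)  # (1, 4, 2, 6)

In the module itself, `FlaxCognitivessRotaryEmbedding` presumably looks up the precomputed table by `position_ids` and applies the same transform to both key and query, as the later `key, query = self.rotary_emb(key, query, position_ids)` call suggests.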
@@ -146,7 +161,6 @@ class FlaxCognitivessRMSNorm(nn.Module):
         return self.weight * jnp.asarray(hidden_states, dtype=self.dtype)
 
 
-# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRotaryEmbedding with Llama->Cognitivess
 class FlaxCognitivessRotaryEmbedding(nn.Module):
     config: CognitivessConfig
     dtype: jnp.dtype = jnp.float32
@@ -168,86 +182,46 @@ class FlaxCognitivessRotaryEmbedding(nn.Module):
         return key, query
 
 
-# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaMLP with Llama->Cognitivess
-class FlaxCognitivessMLP(nn.Module):
-    config: CognitivessConfig
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        embed_dim = self.config.hidden_size
-        inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * embed_dim
-
-        kernel_init = jax.nn.initializers.normal(self.config.initializer_range)
-        self.act = ACT2FN[self.config.hidden_act]
-
-        self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
-        self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
-        self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
-
-    def __call__(self, hidden_states):
-        up_proj_states = self.up_proj(hidden_states)
-        gate_states = self.act(self.gate_proj(hidden_states))
-
-        hidden_states = self.down_proj(up_proj_states * gate_states)
-        return hidden_states
-
-
-# Copied from transformers.models.llama.modeling_flax_llama.apply_rotary_pos_emb
-def apply_rotary_pos_emb(tensor, sin_pos, cos_pos):
-    return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos)
-
-
-# Copied from transformers.models.llama.modeling_flax_llama.create_sinusoidal_positions
-def create_sinusoidal_positions(num_pos, dim):
-    inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))
-    freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")
-
-    emb = np.concatenate((freqs, freqs), axis=-1)
-    out = np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1)
-    return jnp.array(out[:, :, :num_pos])
-
-
-# Copied from transformers.models.llama.modeling_flax_llama.rotate_half
-def rotate_half(tensor):
-    """Rotates half the hidden dims of the input."""
-    rotate_half_tensor = jnp.concatenate(
-        (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1
-    )
-    return rotate_half_tensor
-
-
 class FlaxCognitivessAttention(nn.Module):
     config: CognitivessConfig
     dtype: jnp.dtype = jnp.float32
+    causal: bool = True
+    is_cross_attention: bool = False
 
     def setup(self):
         config = self.config
-        self.
+        self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
-        self.head_dim = self.
+        self.head_dim = self.embed_dim // self.num_heads
         self.num_key_value_heads = config.num_key_value_heads
         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
-        self.max_position_embeddings = config.max_position_embeddings
         self.attention_softmax_in_fp32 = self.dtype is not jnp.float32
-
-
+
+        dense = partial(
+            nn.Dense,
+            use_bias=config.attention_bias,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+
+        self.q_proj = dense(self.num_heads * self.head_dim)
+        self.k_proj = dense(self.num_key_value_heads * self.head_dim)
+        self.v_proj = dense(self.num_key_value_heads * self.head_dim)
+        self.o_proj = dense(self.embed_dim)
+        if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
-                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.embed_dim}"
                 f" and `num_heads`: {self.num_heads})."
             )
-
-        self.
-        self.v_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype)
-        self.o_proj = nn.Dense(self.hidden_size, use_bias=False, dtype=self.dtype)
-        casual_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
-        self.causal_mask = jnp.triu(casual_mask, k=-config.sliding_window)
+
+        self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
         self.rotary_emb = FlaxCognitivessRotaryEmbedding(config, dtype=self.dtype)
 
     def _split_heads(self, hidden_states, num_heads):
         return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
 
     def _merge_heads(self, hidden_states):
-        return hidden_states.reshape(hidden_states.shape[:2] + (self.
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
 
     @nn.compact
     # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache
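
In the rewritten `setup`, a single `functools.partial` over `nn.Dense` pre-binds the options shared by all four projections (`use_bias=config.attention_bias`, `dtype`, `kernel_init`), and the key/value projections are sized by `num_key_value_heads` instead of `num_heads`, i.e. grouped-query attention. A minimal sketch of the same pattern with made-up sizes (32 query heads over 8 key/value heads; none of these numbers come from this repo's config):

    from functools import partial

    import flax.linen as nn
    import jax
    import jax.numpy as jnp

    hidden_size, num_heads, num_key_value_heads = 4096, 32, 8
    head_dim = hidden_size // num_heads  # 128

    # partial() fixes the shared keyword arguments once; each projection then
    # only states its output width (use_bias is hard-coded False just for this sketch).
    dense = partial(nn.Dense, use_bias=False, dtype=jnp.float32,
                    kernel_init=jax.nn.initializers.normal(0.02))

    q_proj = dense(num_heads * head_dim)            # hidden -> 32 * 128
    k_proj = dense(num_key_value_heads * head_dim)  # hidden -> 8 * 128
    v_proj = dense(num_key_value_heads * head_dim)
    o_proj = dense(hidden_size)

    x = jnp.ones((2, 16, hidden_size))              # (batch, seq, hidden)
    params = q_proj.init(jax.random.PRNGKey(0), x)
    print(q_proj.apply(params, x).shape)            # (2, 16, 4096)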
@@ -284,23 +258,25 @@ class FlaxCognitivessAttention(nn.Module):
 
     def __call__(
         self,
-        hidden_states
-        attention_mask
-        position_ids
+        hidden_states,
+        attention_mask,
+        position_ids,
         deterministic: bool = True,
-        output_attentions: bool = False,
         init_cache: bool = False,
-
-
-
-
+        output_attentions: bool = False,
+    ):
+        query = self.q_proj(hidden_states)
+        key = self.k_proj(hidden_states)
+        value = self.v_proj(hidden_states)
+
+        query = self._split_heads(query, self.num_heads)
+        key = self._split_heads(key, self.num_key_value_heads)
+        value = self._split_heads(value, self.num_key_value_heads)
 
-
-
-
+        key, query = self.rotary_emb(key, query, position_ids)
+
+        query_length, key_length = query.shape[1], key.shape[1]
 
-        key_states, query_states = self.rotary_emb(key_states, query_states, position_ids)
-        query_length, key_length = query_states.shape[1], key_states.shape[1]
         if self.has_variable("cache", "cached_key"):
             mask_shift = self.variables["cache"]["cache_index"]
             max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
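
The new `__call__` body projects the hidden states once per stream and reshapes each result per head: queries get `num_heads` heads, keys and values only `num_key_value_heads`, with the rotary embedding applied afterwards. A toy illustration of the `_split_heads` reshape (sizes invented for the example):

    import jax.numpy as jnp

    batch, seq_len, hidden_size = 2, 5, 12
    num_heads, num_key_value_heads = 3, 1
    head_dim = hidden_size // num_heads  # 4

    def split_heads(hidden_states, heads):
        # Same reshape as FlaxCognitivessAttention._split_heads.
        return hidden_states.reshape(hidden_states.shape[:2] + (heads, head_dim))

    query = jnp.ones((batch, seq_len, num_heads * head_dim))          # (2, 5, 12)
    key = jnp.ones((batch, seq_len, num_key_value_heads * head_dim))  # (2, 5, 4)

    print(split_heads(query, num_heads).shape)          # (2, 5, 3, 4)
    print(split_heads(key, num_key_value_heads).shape)  # (2, 5, 1, 4)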
@@ -312,16 +288,23 @@
 
         batch_size = hidden_states.shape[0]
         causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
         attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
         attention_mask = combine_masks(attention_mask, causal_mask)
 
+        dropout_rng = None
+        if not deterministic and self.config.attention_dropout > 0.0:
+            dropout_rng = self.make_rng("dropout")
+
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
         if self.has_variable("cache", "cached_key") or init_cache:
-
-
-
-
-        value_states = jnp.repeat(value_states, self.num_key_value_groups, axis=2)
+            key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
+
+        key = jnp.repeat(key, self.num_key_value_groups, axis=2)
+        value = jnp.repeat(value, self.num_key_value_groups, axis=2)
 
+        # transform boolean mask into float mask
         attention_bias = lax.select(
             attention_mask > 0,
             jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
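
Because keys and values carry fewer heads than the queries, the hunk above tiles them with `jnp.repeat(..., self.num_key_value_groups, axis=2)` after the cache update, so the standard dot-product attention again sees one key/value head per query head. A self-contained sketch of that expansion with toy shapes:

    import jax.numpy as jnp

    batch, seq_len, head_dim = 2, 5, 4
    num_heads, num_key_value_heads = 8, 2
    num_key_value_groups = num_heads // num_key_value_heads  # 4 query heads share each KV head

    key = jnp.ones((batch, seq_len, num_key_value_heads, head_dim))
    value = jnp.ones((batch, seq_len, num_key_value_heads, head_dim))

    # Repeat along the head axis so the downstream attention sees num_heads heads.
    key = jnp.repeat(key, num_key_value_groups, axis=2)
    value = jnp.repeat(value, num_key_value_groups, axis=2)
    print(key.shape, value.shape)  # (2, 5, 8, 4) (2, 5, 8, 4)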
@@ -331,18 +314,19 @@
         # usual dot product attention
         attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype
         attn_weights = dot_product_attention_weights(
-
-
+            query,
+            key,
             bias=attention_bias,
-
+            dropout_rng=dropout_rng,
             dropout_rate=self.config.attention_dropout,
+            deterministic=deterministic,
             dtype=attention_dtype,
         )
 
         if self.attention_softmax_in_fp32:
             attn_weights = attn_weights.astype(self.dtype)
 
-        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights,
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
         attn_output = self._merge_heads(attn_output)
         attn_output = self.o_proj(attn_output)
 
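
The softmax weights now come from Flax's `dot_product_attention_weights`, with the dropout RNG and `deterministic` flag threaded through, and the weighted sum over values is the `"...hqk,...khd->...qhd"` einsum. A small shape-level sketch (toy tensors, nowhere near the model's real dimensions):

    import jax.numpy as jnp
    from flax.linen.attention import dot_product_attention_weights

    # (batch, length, heads, head_dim) for query/key/value.
    query = jnp.ones((1, 3, 2, 4))
    key = jnp.ones((1, 3, 2, 4))
    value = jnp.ones((1, 3, 2, 4))

    # Weights have shape (batch, heads, q_len, kv_len); the einsum contracts the
    # kv_len axis against the values, giving (batch, q_len, heads, head_dim).
    attn_weights = dot_product_attention_weights(query, key, deterministic=True)
    attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
    print(attn_weights.shape, attn_output.shape)  # (1, 2, 3, 3) (1, 3, 2, 4)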
@@ -350,7 +334,29 @@
         return outputs
 
 
-
+class FlaxCognitivessMLP(nn.Module):
+    config: CognitivessConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        embed_dim = self.config.hidden_size
+        inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * embed_dim
+
+        kernel_init = jax.nn.initializers.normal(self.config.initializer_range)
+        self.act = ACT2FN[self.config.hidden_act]
+
+        self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+        self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+        self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+
+    def __call__(self, hidden_states):
+        up_proj_states = self.up_proj(hidden_states)
+        gate_states = self.act(self.gate_proj(hidden_states))
+
+        hidden_states = self.down_proj(up_proj_states * gate_states)
+        return hidden_states
+
+
 class FlaxCognitivessDecoderLayer(nn.Module):
     config: CognitivessConfig
     dtype: jnp.dtype = jnp.float32
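
`FlaxCognitivessMLP`, now placed after the attention block, is the usual gated feed-forward: the activated gate projection multiplies the up projection elementwise before the down projection maps back to the hidden size. A compact stand-in with hard-coded toy sizes (and `silu` assumed as the activation, which in the real module is whatever `ACT2FN[config.hidden_act]` resolves to):

    import flax.linen as nn
    import jax
    import jax.numpy as jnp

    class GatedMLP(nn.Module):
        """Minimal stand-in for FlaxCognitivessMLP with invented sizes."""

        hidden_size: int = 16
        intermediate_size: int = 64

        @nn.compact
        def __call__(self, x):
            gate = nn.silu(nn.Dense(self.intermediate_size, use_bias=False)(x))
            up = nn.Dense(self.intermediate_size, use_bias=False)(x)
            return nn.Dense(self.hidden_size, use_bias=False)(gate * up)

    x = jnp.ones((2, 8, 16))
    mlp = GatedMLP()
    params = mlp.init(jax.random.PRNGKey(0), x)
    print(mlp.apply(params, x).shape)  # (2, 8, 16)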
@@ -526,7 +532,6 @@ class FlaxCognitivessPreTrainedModel(FlaxPreTrainedModel):
         return outputs
 
 
-# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaLayerCollection with Llama->Cognitivess
 class FlaxCognitivessLayerCollection(nn.Module):
     config: CognitivessConfig
     dtype: jnp.dtype = jnp.float32
@@ -573,7 +578,6 @@ class FlaxCognitivessLayerCollection(nn.Module):
         return outputs
 
 
-# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaModule with Llama->Cognitivess
 class FlaxCognitivessModule(nn.Module):
     config: CognitivessConfig
     dtype: jnp.dtype = jnp.float32
@@ -644,13 +648,12 @@ class FlaxCognitivessModel(FlaxCognitivessPreTrainedModel):
 append_call_sample_docstring(
     FlaxCognitivessModel,
     _CHECKPOINT_FOR_DOC,
-
+    FlaxBaseModelOutput,
     _CONFIG_FOR_DOC,
     real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
 )
 
 
-# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaForCausalLMModule with Llama->Cognitivess
 class FlaxCognitivessForCausalLMModule(nn.Module):
     config: CognitivessConfig
     dtype: jnp.dtype = jnp.float32
@@ -701,7 +704,6 @@ class FlaxCognitivessForCausalLMModule(nn.Module):
     """,
     Cognitivess_START_DOCSTRING,
 )
-
 # Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJForCausalLM with GPTJ->Cognitivess
 class FlaxCognitivessForCausalLM(FlaxCognitivessPreTrainedModel):
     module_class = FlaxCognitivessForCausalLMModule
@@ -736,7 +738,7 @@ class FlaxCognitivessForCausalLM(FlaxCognitivessPreTrainedModel):
 append_call_sample_docstring(
     FlaxCognitivessForCausalLM,
     _CHECKPOINT_FOR_DOC,
-
+    FlaxCausalLMOutput,
     _CONFIG_FOR_DOC,
     real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
 )