rhysjones committed on
Commit
3f7726c
1 Parent(s): f2bac03

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. config.json +2 -1
  2. modeling_gpt2.py +4 -4
config.json CHANGED
@@ -32,5 +32,6 @@
32
  "torch_dtype": "bfloat16",
33
  "transformers_version": "4.41.2",
34
  "use_cache": true,
35
- "vocab_size": 50257
 
36
  }
 
32
  "torch_dtype": "bfloat16",
33
  "transformers_version": "4.41.2",
34
  "use_cache": true,
35
+ "vocab_size": 50257,
36
+ "#": {"_attn_implementation": "flash_attention_2"}
37
  }
modeling_gpt2.py CHANGED
@@ -171,8 +171,8 @@ class GPT2Attention(nn.Module):
171
  self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
172
  self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
173
 
174
- # rhys101 do attention in float32 if model in bfloat16 ?
175
- if self.config.torch_dtype == torch.bfloat16:
176
  self.c_attn = self.c_attn.to(torch.float32)
177
  self.c_proj = self.c_proj.to(torch.float32)
178
 
@@ -315,8 +315,8 @@ class GPT2Attention(nn.Module):
315
  output_attentions: Optional[bool] = False,
316
  ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
317
 
318
- # rhys101 do attention in float32 if model in bfloat16 ?
319
- if self.config.torch_dtype == torch.bfloat16:
320
  hidden_states = hidden_states.to(torch.float32)
321
 
322
  if encoder_hidden_states is not None:
 
171
  self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
172
  self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
173
 
174
+ # rhys101 do non flash attention in float32 if model in bfloat16 ?
175
+ if self.config._attn_implementation == 'eager' and self.config.torch_dtype == torch.bfloat16:
176
  self.c_attn = self.c_attn.to(torch.float32)
177
  self.c_proj = self.c_proj.to(torch.float32)
178
 
 
315
  output_attentions: Optional[bool] = False,
316
  ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
317
 
318
+ # rhys101 do non flash attention in float32 if model in bfloat16 ?
319
+ if self.config._attn_implementation == 'eager' and self.config.torch_dtype == torch.bfloat16:
320
  hidden_states = hidden_states.to(torch.float32)
321
 
322
  if encoder_hidden_states is not None: