DanielHesslow committed on
Commit
ee739e3
·
1 Parent(s): e283837
Files changed (4) hide show
  1. config.json +4 -13
  2. pytorch_model.bin +2 -2
  3. rita_configuration.py +5 -8
  4. rita_modeling.py +4 -5
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "Seledorn/RITA_s",
3
  "architectures": [
4
  "RITAModel"
5
  ],
@@ -8,24 +8,15 @@
8
  "AutoModel": "rita_modeling.RITAModel",
9
  "AutoModelForCausalLM": "rita_modeling.RITAModel"
10
  },
11
- "bos_token_id": [
12
- [
13
- [
14
- [
15
- 50256
16
- ]
17
- ]
18
- ]
19
- ],
20
  "d_feedforward": 3072,
21
  "d_model": 768,
22
  "dropout": 0.0,
23
- "eos_token_id": 50256,
24
  "max_seq_len": 1024,
25
- "model_type": "codegen",
26
  "num_heads": 12,
27
  "num_layers": 12,
28
  "torch_dtype": "float32",
29
  "transformers_version": "4.18.0",
30
- "vocab_size": 128
31
  }
 
1
  {
2
+ "_name_or_path": "nz/RITA_s",
3
  "architectures": [
4
  "RITAModel"
5
  ],
 
8
  "AutoModel": "rita_modeling.RITAModel",
9
  "AutoModelForCausalLM": "rita_modeling.RITAModel"
10
  },
 
 
 
 
 
 
 
 
 
11
  "d_feedforward": 3072,
12
  "d_model": 768,
13
  "dropout": 0.0,
14
+ "eos_token_id": 2,
15
  "max_seq_len": 1024,
16
+ "model_type": "rita",
17
  "num_heads": 12,
18
  "num_layers": 12,
19
  "torch_dtype": "float32",
20
  "transformers_version": "4.18.0",
21
+ "vocab_size": 26
22
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f550205b710fd115dfe670923f39793821dd05609f1ee7eb24f1315076b0a61
3
- size 340681123
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc502eec97680f2dbefb38131ea4e2aa465bceb7f4bcdaaee0358726a21e361
3
+ size 340367779
rita_configuration.py CHANGED
@@ -1,26 +1,24 @@
1
-
2
  from transformers.configuration_utils import PretrainedConfig
3
  from transformers.utils import logging
4
 
5
  logger = logging.get_logger(__name__)
6
 
7
  class RITAConfig(PretrainedConfig):
8
- model_type = "codegen"
9
 
10
  def __init__(
11
  self,
12
- vocab_size=128,
13
  d_model=768,
14
  num_layers=12,
15
  max_seq_len=1024,
16
  num_heads=12,
17
  dropout=0.,
18
  ff_ratio=4,
19
- bos_token_id=50256, # TODO
20
- eos_token_id=50256, # TODO
21
  **kwargs,
22
  ):
23
- super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
24
  self.vocab_size = vocab_size
25
  self.d_model = d_model
26
  self.num_heads = num_heads
@@ -28,5 +26,4 @@ class RITAConfig(PretrainedConfig):
28
  self.num_layers = num_layers
29
  self.max_seq_len=max_seq_len
30
  self.dropout = dropout
31
- self.bos_token_id=bos_token_id,
32
- self.eos_token_id=eos_token_id
 
 
1
  from transformers.configuration_utils import PretrainedConfig
2
  from transformers.utils import logging
3
 
4
  logger = logging.get_logger(__name__)
5
 
6
  class RITAConfig(PretrainedConfig):
7
+ model_type = "rita"
8
 
9
  def __init__(
10
  self,
11
+ vocab_size=26,
12
  d_model=768,
13
  num_layers=12,
14
  max_seq_len=1024,
15
  num_heads=12,
16
  dropout=0.,
17
  ff_ratio=4,
18
+ eos_token_id=2,
 
19
  **kwargs,
20
  ):
21
+ super().__init__(eos_token_id=eos_token_id, **kwargs)
22
  self.vocab_size = vocab_size
23
  self.d_model = d_model
24
  self.num_heads = num_heads
 
26
  self.num_layers = num_layers
27
  self.max_seq_len=max_seq_len
28
  self.dropout = dropout
29
+ self.eos_token_id=eos_token_id
 
rita_modeling.py CHANGED
@@ -222,10 +222,10 @@ class RITAModel(PreTrainedModel):
222
  self.final_norm = nn.LayerNorm(config.d_model)
223
  self.projector = nn.Linear(config.d_model, config.vocab_size, bias = False)
224
 
225
- def forward(self, ids, attn_mask=None, padding_mask=None, return_hidden=False) -> torch.FloatTensor:
226
- x = self.embedding(ids) # N x L x D
227
  if attn_mask == None:
228
- attn_mask = (torch.triu(torch.ones(ids.size(1), ids.size(1))) == 0).transpose(0, 1).contiguous()
229
  for layer in self.layers:
230
  x = layer(x, attn_mask=attn_mask, padding_mask=padding_mask)
231
  x = self.final_norm(x) # N x L x D
@@ -246,5 +246,4 @@ class RITAModel(PreTrainedModel):
246
  return self.projector
247
 
248
  def set_output_embeddings(self, new_projector):
249
- return new_projector
250
-
 
222
  self.final_norm = nn.LayerNorm(config.d_model)
223
  self.projector = nn.Linear(config.d_model, config.vocab_size, bias = False)
224
 
225
+ def forward(self, input_ids, attn_mask=None, padding_mask=None, return_hidden=False) -> torch.FloatTensor:
226
+ x = self.embedding(input_ids) # N x L x D
227
  if attn_mask == None:
228
+ attn_mask = (torch.triu(torch.ones(input_ids.size(1), input_ids.size(1))) == 0).transpose(0, 1).contiguous().to(input_ids.device)
229
  for layer in self.layers:
230
  x = layer(x, attn_mask=attn_mask, padding_mask=padding_mask)
231
  x = self.final_norm(x) # N x L x D
 
246
  return self.projector
247
 
248
  def set_output_embeddings(self, new_projector):
249
+ self.projector = new_projector