ubaada committed
Commit
6c5f117
1 Parent(s): 1310c3d

Upload 6 files

config.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "_attn_implementation_autoset": true,
+   "auto_map": {
+     "AutoModel": "modeling_original_transformer.OrginalTransformer",
+     "AutoModelForSeq2SeqLM": "modeling_original_transformer.OrginalTransformer"
+   },
+   "bos_token_id": 1,
+   "d_ff": 2048,
+   "dec_vocab_size": 37000,
+   "dropout": 0,
+   "embed_dim": 512,
+   "enc_vocab_size": 37000,
+   "eos_token_id": 2,
+   "is_encoder_decoder": true,
+   "model_type": "original_transformer",
+   "num_dec_layers": 6,
+   "num_enc_layers": 6,
+   "num_heads": 8,
+   "pad_token_id": 0,
+   "transformers_version": "4.46.1"
+ }
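The auto_map above points both AutoModel and AutoModelForSeq2SeqLM at the custom OrginalTransformer class, so the checkpoint has to be loaded with trust_remote_code=True. A minimal loading sketch; the repo id below is an assumption, substitute the real model id or a local path to these files:

    from transformers import AutoModelForSeq2SeqLM

    # Assumed repo id for illustration only; replace with the actual model id or local directory.
    model = AutoModelForSeq2SeqLM.from_pretrained(
        "ubaada/original-transformer",
        trust_remote_code=True,   # lets auto_map resolve modeling_original_transformer.OrginalTransformer
    )
    print(model.config.embed_dim, model.config.num_heads)  # 512, 8 as set in config.json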
generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+
+   "pad_token_id": 0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "do_sample": true,
+   "top_k": 20,
+   "top_p": 0.6,
+   "num_beams": 5,
+   "repetition_penalty": 1.05,
+   "temperature": 0.7,
+   "transformers_version": "4.31.0",
+   "max_length": 512
+ }
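These values are the defaults that model.generate() picks up from generation_config.json; with do_sample=true and num_beams=5 set together, Transformers runs beam-search multinomial sampling. A hedged usage sketch — the repo id and the example sentence are placeholders, and the actual source/target languages depend on how the checkpoint was trained:

    import torch
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    repo = "ubaada/original-transformer"  # assumption: replace with the real repo id or a local path
    tokenizer = AutoTokenizer.from_pretrained(repo)
    model = AutoModelForSeq2SeqLM.from_pretrained(repo, trust_remote_code=True)

    inputs = tokenizer("This is a test sentence.", return_tensors="pt")
    with torch.no_grad():
        # max_length, top_k, top_p, temperature, num_beams all come from generation_config.json
        output_ids = model.generate(**inputs)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))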
modeling_original_transformer.py ADDED
@@ -0,0 +1,315 @@
+ import torch
+ import torch.nn as nn
+ from transformers import PreTrainedModel, PretrainedConfig, GenerationMixin
+ # Seq2SeqLMOutput for model forward, BaseModelOutputWithPastAndCrossAttentions for encoder forward
+ from transformers.modeling_outputs import Seq2SeqLMOutput, BaseModelOutputWithPastAndCrossAttentions
+ from transformers import AutoConfig, AutoModel, AutoModelForSeq2SeqLM
+ from torch.nn.utils.rnn import pad_sequence
+
+
+ _MAX_CONTEXT_SIZE = 10_000
+
+ # ==========================================================
+ # Config
+ # ==========================================================
+ class OriginalTransformerConfig(PretrainedConfig):
+     model_type = "original_transformer"
+
+     def __init__(
+         self,
+         num_enc_layers = 6,
+         num_dec_layers = 6,
+         embed_dim = 512,
+         num_heads = 8,
+         enc_vocab_size = 37000,
+         dec_vocab_size = 37000,
+         d_ff = 2048,
+         dropout=0,
+         pad_token_id = 0,
+         bos_token_id = 1,
+         eos_token_id = 2,
+         is_encoder_decoder=True,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.num_enc_layers = num_enc_layers
+         self.num_dec_layers = num_dec_layers
+         self.embed_dim = embed_dim
+         self.num_heads = num_heads
+         self.enc_vocab_size = enc_vocab_size
+         self.dec_vocab_size = dec_vocab_size
+         self.d_ff = d_ff
+         self.dropout = dropout
+         self.pad_token_id = pad_token_id
+         self.bos_token_id = bos_token_id
+         self.eos_token_id = eos_token_id
+
+         self.is_encoder_decoder = is_encoder_decoder
+
+         # for using AutoModel.from_pretrained
+         self.auto_map = {
+             "AutoModel": "modeling_original_transformer.OrginalTransformer",
+             "AutoModelForSeq2SeqLM": "modeling_original_transformer.OrginalTransformer"
+         }
+
+
+
+
+ # ==========================================================
+ # Model
+ # ==========================================================
+
+
+
+
+ # combines both embedding and pos_encoding
+ class Embed(nn.Module):
+     def __init__(self, vocab_size, embed_dim, dropout=0):
+         super().__init__()
+         self.emb_factor = torch.sqrt(torch.tensor(embed_dim, dtype=torch.float32))
+         self.embed = nn.Embedding(vocab_size, embed_dim) # vocab x C
+         self.dropout = nn.Dropout(dropout)
+
+         pos_embed = torch.zeros(_MAX_CONTEXT_SIZE, embed_dim) # T x C
+         position = torch.arange(0, _MAX_CONTEXT_SIZE).unsqueeze(1) # FROM 1 x T to T x 1
+         # P.E(pos,2i) = sin(pos/10000^(2i/dim))
+
+         # div_term = 10000 ^([0,1,2,...,C/2-1] * 2/C)
+         div_term = torch.pow(10_000.0, torch.arange(0, embed_dim//2) * 2/embed_dim) # 1 x C/2 (Embed_dim/2)
+
+         pos_embed[:, 0::2] = torch.sin(position / div_term) # T x C/2 ((T x 1) / (1 x C/2) = T x C/2 broadcasted)
+         pos_embed[:, 1::2] = torch.cos(position / div_term) # T x C/2
+
+         self.register_buffer('pos_embed', pos_embed, persistent=False)
+
+
+
+     def forward(self, x):
+         # x = B x T (NOT 1-hot)
+         embed_x = self.embed(x) # B T C
+         embed_x = embed_x * self.emb_factor # presumably to not be overpowered by the positional encoding
+
+         # ================================
+         # For variable length
+         # ================================
+         seq_len = x.shape[-1] # length of T
+         truc_pos_embed = self.pos_embed[:seq_len, :]
+         embed_x = self.dropout(embed_x + truc_pos_embed)
+
+         return embed_x
+
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, embed_dim, num_heads, causal_mask = False, bias=True):
+         super().__init__()
+         self.dk = embed_dim // num_heads
+         self.causal_mask = causal_mask
+         self.combined_projection_q = nn.Linear(embed_dim, embed_dim, bias=bias)
+         self.combined_projection_k = nn.Linear(embed_dim, embed_dim, bias=bias)
+         self.combined_projection_v = nn.Linear(embed_dim, embed_dim, bias=bias)
+         self.num_heads = num_heads
+         self.multi_linear = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+     def attention(self, q, k, v, padding_mask = None):
+         # input shape is B x h x T x dk
+         output = (q @ k.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.dk)) # QK^T / sqrt(dk)
+
+         # apply causal mask in decoder self-attention
+         if self.causal_mask:
+             seq_len = q.shape[-2]
+             # upper-triangular -inf mask blocks attention to future positions
+             mask = torch.triu(torch.full((seq_len, seq_len), fill_value=-torch.inf, device=q.device), diagonal=1)
+             output = output + mask
+
+         # apply padding mask in encoder self-attention and decoder cross-attention
+         if padding_mask is not None:
+             # padding_mask: 1 for real tokens, 0 for padding
+             padding_mask = torch.as_tensor(padding_mask, device=output.device).unsqueeze(1).unsqueeze(1) # B x 1 x 1 x T (broadcasting)
+             padding_mask = torch.where(padding_mask == 0, -torch.inf, 0.0) # padded (0) positions become -inf, kept positions 0
+             output = output + padding_mask
+
+
+         output = torch.softmax(output, -1)
+         output = output @ v
+         return output
+
+     def forward(self, x_q, x_k, x_v, padding_mask = None):
+         # combined projection, TxC @ CxC
+         # Equivalent to doing Txhead @ CxC over all heads
+         p_q = self.combined_projection_q(x_q)
+         p_k = self.combined_projection_k(x_k)
+         p_v = self.combined_projection_v(x_v)
+
+         # For each of QKV. [B=Batch, T=Time, C=Channels, h=Heads, dk=head dim]
+
+         # ========================|======================
+         #          Split          |      Combine
+         # ========================|======================
+         #   |      B T C          |        /\
+         #   |     <view>          |      <view>
+         #   |      B T h dk       |         |
+         #   |   <transpose>       |    <transpose>
+         #  \/      B h T dk       |         |
+         #                         |
+         #                      <attn>
+         # ===============================================
+
+
+         B = p_q.shape[0]
+         def split_heads(p):
+             return p.view(B, -1, self.num_heads, self.dk).transpose(1, 2)
+
+         p_q = split_heads(p_q)
+         p_k = split_heads(p_k)
+         p_v = split_heads(p_v)
+
+         output = self.attention(p_q, p_k, p_v, padding_mask=padding_mask)
+
+         def combine_heads(p):
+             return p.transpose(1, 2).contiguous().view(B, -1, self.dk * self.num_heads)
+
+         output = combine_heads(output)
+         output = self.multi_linear(output)
+         return output
+
+ # This layer is slightly different from standard linear
+ class PointwiseFeedForward(nn.Module):
+     def __init__(self, embed_dim, d_ff):
+         super(PointwiseFeedForward, self).__init__()
+         self.linear1 = nn.Linear(embed_dim, d_ff, bias=True)
+         self.linear2 = nn.Linear(d_ff, embed_dim, bias=True)
+     def forward(self, x):
+         return self.linear2(nn.functional.relu(self.linear1(x)))
+
+ class EncoderLayer(nn.Module):
+     def __init__(self, embed_dim, num_heads, d_ff, dropout=0):
+         super().__init__()
+         # self attention
+         self.m_att = MultiHeadAttention(embed_dim, num_heads)
+         self.att_norm = nn.LayerNorm(embed_dim)
+         self.dropout1 = nn.Dropout(dropout)
+
+         # pointwise feedforward module
+         self.pwlinear = PointwiseFeedForward(embed_dim, d_ff)
+         self.lin_norm = nn.LayerNorm(embed_dim)
+         self.dropout2 = nn.Dropout(dropout)
+     def forward(self, x, padding_mask = None):
+         output = self.att_norm(x + self.dropout1(self.m_att(x, x, x, padding_mask=padding_mask)))
+         output = self.lin_norm(output + self.dropout2(self.pwlinear(output)))
+         return output
+
+ class EncoderStack(nn.Module):
+     def __init__(self, embed_dim, num_heads, num_layers, d_ff, dropout=0, bos_token_id=1, eos_token_id=2, pad_token_id=0):
+         super().__init__()
+         self.layers = nn.ModuleList([EncoderLayer(embed_dim, num_heads, d_ff, dropout) for i in range(num_layers)])
+
+         self.bos_token_id = bos_token_id
+         self.eos_token_id = eos_token_id
+         self.pad_token_id = pad_token_id
+
+     def add_bos_eos(self, input_ids):
+
+         modified_input_ids = []
+         for seq in input_ids: # iterate through each batch element
+             # Prepend BOS token if needed
+             if seq[0] != self.bos_token_id:
+                 seq = torch.cat([torch.tensor([self.bos_token_id], device=seq.device), seq])
+             # Append EOS token if needed
+             if seq[-1] != self.eos_token_id:
+                 seq = torch.cat([seq, torch.tensor([self.eos_token_id], device=seq.device)])
+             modified_input_ids.append(seq)
+         # Pad sequences to the same length
+         padded_input_ids = pad_sequence(modified_input_ids, batch_first=True, padding_value=self.pad_token_id)
+         return padded_input_ids
+
+     # For Hugging Face compatibility, input embeddings are computed inside the encoder,
+     # so the encoder must handle both input_ids and input_embeds.
+     # It reuses the parent's embedding layer; moving that layer here would break saved checkpoints.
+     def forward(self, input_embeds=None, input_ids=None, padding_mask = None, **kwargs):
+
+         # embed input_ids only when precomputed embeddings were not supplied
+         if input_embeds is None:
+             input_ids = self.add_bos_eos(input_ids) # add BOS and EOS tokens if absent
+             input_embeds = self.emb(input_ids)
+
+         for layer in self.layers:
+             input_embeds = layer(input_embeds, padding_mask = padding_mask)
+
+
+         return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=input_embeds, hidden_states=None, attentions=None)
+
+ class DecoderLayer(nn.Module):
+     def __init__(self, embed_dim, num_heads, d_ff, dropout=0):
+         super().__init__()
+         # self causal mask attention module
+         self.m_att = MultiHeadAttention(embed_dim, num_heads, causal_mask=True)
+         self.att_norm = nn.LayerNorm(embed_dim)
+         self.dropout1 = nn.Dropout(dropout)
+
+         # additional cross attention module
+         self.cross_att = MultiHeadAttention(embed_dim, num_heads, causal_mask=False)
+         self.cross_att_norm = nn.LayerNorm(embed_dim)
+         self.dropout2 = nn.Dropout(dropout)
+
+         # pointwise feedforward module with its layer norm
+         self.pwlinear = PointwiseFeedForward(embed_dim, d_ff)
+         self.lin_norm = nn.LayerNorm(embed_dim)
+         self.dropout3 = nn.Dropout(dropout)
+     def forward(self, x, enc_out, enc_padding_mask = None):
+         output = self.att_norm(x + self.dropout1(self.m_att(x, x, x))) # self attention
+         output = self.cross_att_norm(output + self.dropout2(self.cross_att(output, enc_out, enc_out, padding_mask=enc_padding_mask))) # cross attention
+         output = self.lin_norm(output + self.dropout3(self.pwlinear(output))) # pointwise feedforward
+
+         return output
+
+ class DecoderStack(nn.Module):
+     def __init__(self, embed_dim, num_heads, num_layers, d_ff, dropout=0):
+         super().__init__()
+         self.layers = nn.ModuleList([DecoderLayer(embed_dim, num_heads, d_ff, dropout) for i in range(num_layers)])
+     def forward(self, x, enc_out, enc_padding_mask = None):
+         for layer in self.layers:
+             x = layer(x, enc_out, enc_padding_mask)
+         return x
+
+ class OrginalTransformer(PreTrainedModel, GenerationMixin):
+     config_class = OriginalTransformerConfig
+
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.emb = Embed(config.enc_vocab_size, config.embed_dim) # one embedding for both encoder and decoder
+
+         self.enc = EncoderStack(config.embed_dim, config.num_heads, config.num_enc_layers, config.d_ff, config.dropout,
+                                 config.bos_token_id, config.eos_token_id, config.pad_token_id)
+         self.dec = DecoderStack(config.embed_dim, config.num_heads, config.num_dec_layers, config.d_ff, config.dropout)
+
+         self.last_lin = nn.Linear(config.embed_dim, config.dec_vocab_size, bias=False) # bias=False since its weights are tied with the embedding layer
+         self.last_lin.weight = self.emb.embed.weight # tying weights
+
+         # for accessing emb from inside encoder and decoder (for HF)
+         self.enc.emb = self.emb
+         self.dec.emb = self.emb
+
+     # Hugging Face-compatible forward
+     def forward(self, input_ids=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None,
+                 head_mask=None, decoder_head_mask=None, encoder_outputs=None, past_key_values=None, use_cache=None,
+                 output_hidden_states=None, token_type_ids=None, inputs_embeds=None, labels=None, **kwargs):
+
+         # Encoder
+         # Not strictly needed here: .generate() runs the encoder itself and passes encoder_outputs.
+         if encoder_outputs is None:
+             encoder_outputs = self.enc(self.emb(input_ids), None) # Encoder
+
+         # Decoder
+         # generate() calls the model with decoder_input_ids
+         dec_out = self.dec(self.emb(decoder_input_ids), encoder_outputs.last_hidden_state, None)
+         logits = self.last_lin(dec_out)
+         output = Seq2SeqLMOutput(logits=logits, encoder_last_hidden_state=encoder_outputs.last_hidden_state)
+         return output
+
+     def get_encoder(self):
+         return self.enc
+
+     def get_decoder(self):
+         return self.dec
+
+
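A quick, hedged smoke test for the classes above: build a small OriginalTransformerConfig, instantiate OrginalTransformer directly, and check the logits shape from a dummy forward pass. It assumes this is run next to modeling_original_transformer.py; the sizes are arbitrary:

    import torch
    from modeling_original_transformer import OriginalTransformerConfig, OrginalTransformer

    config = OriginalTransformerConfig(embed_dim=64, num_heads=4, d_ff=128,
                                       num_enc_layers=2, num_dec_layers=2,
                                       enc_vocab_size=1000, dec_vocab_size=1000)
    model = OrginalTransformer(config)

    src = torch.randint(3, 1000, (2, 7))   # batch of 2 source sequences, length 7
    tgt = torch.randint(3, 1000, (2, 5))   # decoder inputs, length 5
    out = model(input_ids=src, decoder_input_ids=tgt)
    print(out.logits.shape)  # expected: torch.Size([2, 5, 1000])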
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69c36d6f8595a018c623b4b9deb9ac381fce33ef0d8ab83002d5fe405a530dca
+ size 252427525
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "tokenizer_class": "BertTokenizer",
+   "do_lower_case": false,
+   "strip_accents": false,
+   "max_len": 512,
+   "init_inputs": [],
+   "model_max_length": 512,
+   "special_tokens_map_file": null,
+   "tokenizer_file": "tokenizer.json",
+   "name_or_path": "",
+   "tokenizer_type": "BPE"
+ }
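A short, hedged check of the tokenizer files above, assuming they sit in the current directory (the repo id works as well): load them with AutoTokenizer and round-trip a sentence.

    from transformers import AutoTokenizer

    # "." is an assumption: the directory holding tokenizer.json and tokenizer_config.json
    tokenizer = AutoTokenizer.from_pretrained(".")
    enc = tokenizer("Hello world!", return_tensors="pt")
    print(enc["input_ids"])
    print(tokenizer.decode(enc["input_ids"][0]))
    # model_max_length is 512 here, matching max_length in generation_config.json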