NoorAfaqi committed on
Commit d82b09f · verified · 1 Parent(s): 6be9ef5

Upload 4 files

Browse files
Files changed (4)
  1. HabibiTranslator.ipynb +0 -0
  2. app.py +303 -0
  3. habibi.pth +3 -0
  4. requirements.txt +2 -0
HabibiTranslator.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,303 @@
+ # -*- coding: utf-8 -*-
+ """HabibiTranslator.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1lYP3XxUCWdiihU0mIejW_KCqTvy7-tz6
+ """
+
+ import torch
+ torch.cuda.is_available()  # Quick check that a GPU is visible
+
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ import math
+ from datasets import load_dataset
+ import numpy as np
+ from collections import Counter
+ import gradio as gr
+
+ # Setting random seed for reproducibility
+ torch.manual_seed(42)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ dataset = load_dataset('Helsinki-NLP/tatoeba_mt', 'ara-eng')
+
+ # Tokenization (word-level)
+ def tokenize(text):
+     return text.split()
+
+ # Building vocabulary from the dataset
+ def build_vocab(data, tokenizer, min_freq=2):
+     counter = Counter()
+     for example in data:
+         counter.update(tokenizer(example['sourceString']))
+         counter.update(tokenizer(example['targetString']))
+     # Adding special tokens
+     specials = ['<pad>', '<sos>', '<eos>', '<unk>']
+     vocab = specials + [word for word, freq in counter.items() if freq >= min_freq]
+     word2idx = {word: idx for idx, word in enumerate(vocab)}
+     idx2word = {idx: word for word, idx in word2idx.items()}
+     return word2idx, idx2word
+
+ # Converting text to a tensor (adjusted to fit special tokens within max_len)
+ def text_to_tensor(text, vocab, tokenizer, max_len=52):
+     tokens = tokenizer(text)[:max_len - 2]  # Reserving space for <sos> and <eos>
+     tokens = ['<sos>'] + tokens + ['<eos>']
+     tensor = [vocab.get(token, vocab['<unk>']) for token in tokens]
+     return torch.tensor(tensor, dtype=torch.long)
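+ # Illustrative example (actual ids depend on the vocabulary built below; tokens
+ # missing from the vocabulary map to '<unk>'):
+ #   text_to_tensor("hello world", word2idx, tokenize)
+ #   -> tensor([word2idx['<sos>'], word2idx['hello'], word2idx['world'], word2idx['<eos>']])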
+
+ train_data = dataset['validation']  # Using validation as training data for demo
+ test_data = dataset['test']
+
+ # Building shared vocabulary (for simplicity, using both languages in one vocab)
+ word2idx, idx2word = build_vocab(train_data, tokenize)
+
+ # Hyperparameters for data
+ max_len = 52  # Increased to account for <sos> and <eos>
+ batch_size = 32
+
+ train_data_list = list(train_data)  # Convert the Dataset to a list once
+ print(f"Length of train_data_list: {len(train_data_list)}")
+
+ def get_batches(data_list, batch_size, max_len=52):
+     total_batches = len(data_list) // batch_size + (1 if len(data_list) % batch_size else 0)
+     print(f"Total batches to process: {total_batches}")
+     for i in range(0, len(data_list), batch_size):
+         batch = data_list[i:i + batch_size]
+         src_batch = [text_to_tensor(example['sourceString'], word2idx, tokenize, max_len) for example in batch]
+         tgt_batch = [text_to_tensor(example['targetString'], word2idx, tokenize, max_len) for example in batch]
+         src_batch = nn.utils.rnn.pad_sequence(src_batch, padding_value=word2idx['<pad>'], batch_first=False).to(device)
+         tgt_batch = nn.utils.rnn.pad_sequence(tgt_batch, padding_value=word2idx['<pad>'], batch_first=False).to(device)
+         # Truncate or pad every batch to exactly max_len time steps
+         if src_batch.size(0) > max_len:
+             src_batch = src_batch[:max_len, :]
+         elif src_batch.size(0) < max_len:
+             padding = torch.full((max_len - src_batch.size(0), src_batch.size(1)), word2idx['<pad>'], dtype=torch.long).to(device)
+             src_batch = torch.cat([src_batch, padding], dim=0)
+         if tgt_batch.size(0) > max_len:
+             tgt_batch = tgt_batch[:max_len, :]
+         elif tgt_batch.size(0) < max_len:
+             padding = torch.full((max_len - tgt_batch.size(0), tgt_batch.size(1)), word2idx['<pad>'], dtype=torch.long).to(device)
+             tgt_batch = torch.cat([tgt_batch, padding], dim=0)
+         src_batch = src_batch.transpose(0, 1)  # [batch_size, seq_len]
+         tgt_batch = tgt_batch.transpose(0, 1)  # [batch_size, seq_len]
+         yield src_batch, tgt_batch
+
+
+ print("Chunk 1 completed: Dataset loaded and preprocessing ready.")
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, d_model, max_len=52):
+         super().__init__()
+         pe = torch.zeros(max_len, d_model)
+         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+         pe = pe.unsqueeze(0)  # Shape: (1, max_len, d_model)
+         self.register_buffer('pe', pe)
+
+     def forward(self, x):
+         return x + self.pe[:, :x.size(1), :]
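+ # PositionalEncoding implements the standard sinusoidal encoding:
+ #   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
+ #   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
+ # so each position receives a unique, deterministic vector added to its embedding.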
+
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, d_model, num_heads):
+         super().__init__()
+         assert d_model % num_heads == 0
+         self.d_model = d_model
+         self.num_heads = num_heads
+         self.d_k = d_model // num_heads
+         self.W_q = nn.Linear(d_model, d_model)
+         self.W_k = nn.Linear(d_model, d_model)
+         self.W_v = nn.Linear(d_model, d_model)
+         self.W_o = nn.Linear(d_model, d_model)
+
+     def scaled_dot_product_attention(self, Q, K, V, mask=None):
+         scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
+         if mask is not None:
+             scores = scores.masked_fill(mask == 0, -1e9)
+         attn = torch.softmax(scores, dim=-1)
+         return torch.matmul(attn, V)
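+     # The method above computes scaled dot-product attention:
+     #   Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V
+     # with masked positions set to -1e9 before the softmax so they receive ~0 weight.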
+
+     def forward(self, Q, K, V, mask=None):
+         batch_size = Q.size(0)
+         seq_len_q = Q.size(1)
+         seq_len_k = K.size(1)
+         Q = self.W_q(Q)
+         K = self.W_k(K)
+         V = self.W_v(V)
+         Q = Q.view(batch_size, seq_len_q, self.num_heads, self.d_k).transpose(1, 2)
+         K = K.view(batch_size, seq_len_k, self.num_heads, self.d_k).transpose(1, 2)
+         V = V.view(batch_size, seq_len_k, self.num_heads, self.d_k).transpose(1, 2)
+         output = self.scaled_dot_product_attention(Q, K, V, mask)
+         output = output.transpose(1, 2).contiguous().view(batch_size, seq_len_q, self.d_model)
+         return self.W_o(output)
+
+ class FeedForward(nn.Module):
+     def __init__(self, d_model, d_ff):
+         super().__init__()
+         self.linear1 = nn.Linear(d_model, d_ff)
+         self.linear2 = nn.Linear(d_ff, d_model)
+         self.relu = nn.ReLU()
+
+     def forward(self, x):
+         return self.linear2(self.relu(self.linear1(x)))
+
+ class EncoderLayer(nn.Module):
+     def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
+         super().__init__()
+         self.mha = MultiHeadAttention(d_model, num_heads)
+         self.ff = FeedForward(d_model, d_ff)
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x, mask=None):
+         attn_output = self.mha(x, x, x, mask)
+         x = self.norm1(x + self.dropout(attn_output))
+         ff_output = self.ff(x)
+         return self.norm2(x + self.dropout(ff_output))
+
+ class DecoderLayer(nn.Module):
+     def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
+         super().__init__()
+         self.mha1 = MultiHeadAttention(d_model, num_heads)
+         self.mha2 = MultiHeadAttention(d_model, num_heads)
+         self.ff = FeedForward(d_model, d_ff)
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.norm3 = nn.LayerNorm(d_model)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
+         attn1_output = self.mha1(x, x, x, tgt_mask)
+         x = self.norm1(x + self.dropout(attn1_output))
+         attn2_output = self.mha2(x, enc_output, enc_output, src_mask)
+         x = self.norm2(x + self.dropout(attn2_output))
+         ff_output = self.ff(x)
+         return self.norm3(x + self.dropout(ff_output))
+
+ class Transformer(nn.Module):
+     def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, num_heads=8, num_layers=3, d_ff=1024, max_len=52, dropout=0.1):
+         super().__init__()
+         self.d_model = d_model
+         self.src_embedding = nn.Embedding(src_vocab_size, d_model)
+         self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
+         self.pos_encoding = PositionalEncoding(d_model, max_len)
+         self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
+         self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
+         self.fc_out = nn.Linear(d_model, tgt_vocab_size)
+         self.dropout = nn.Dropout(dropout)
+
+     def generate_mask(self, src, tgt):
+         src_mask = (src != word2idx['<pad>']).unsqueeze(1).unsqueeze(2)
+         tgt_mask = (tgt != word2idx['<pad>']).unsqueeze(1).unsqueeze(3)
+         seq_len = tgt.size(1)
+         nopeak_mask = (1 - torch.triu(torch.ones(1, seq_len, seq_len), diagonal=1)).bool().to(device)
+         tgt_mask = tgt_mask & nopeak_mask
+         return src_mask, tgt_mask
+
+     def forward(self, src, tgt):
+         src_mask, tgt_mask = self.generate_mask(src, tgt)
+         src_embedded = self.dropout(self.pos_encoding(self.src_embedding(src) * math.sqrt(self.d_model)))
+         tgt_embedded = self.dropout(self.pos_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model)))
+
+         enc_output = src_embedded
+         for enc_layer in self.encoder_layers:
+             enc_output = enc_layer(enc_output, src_mask)
+
+         dec_output = tgt_embedded
+         for dec_layer in self.decoder_layers:
+             dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
+
+         return self.fc_out(dec_output)
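+ # Mask shapes from generate_mask, broadcast against attention scores of shape
+ # [batch, num_heads, query_len, key_len]:
+ #   src_mask: [batch, 1, 1, src_len] hides <pad> positions in the source
+ #   tgt_mask: [batch, 1, tgt_len, tgt_len] hides <pad> positions and future tokens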
+
+ print("Chunk 2 completed: Transformer model defined with max_len=52.")
+
+ vocab_size = len(word2idx)
+ model = Transformer(
+     src_vocab_size=vocab_size,
+     tgt_vocab_size=vocab_size,
+     d_model=256,
+     num_heads=8,
+     num_layers=3,
+     d_ff=1024,
+     max_len=52,
+     dropout=0.1
+ ).to(device)
+
+ # Loss and optimizer
+ criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<pad>'])
+ optimizer = optim.Adam(model.parameters(), lr=0.0001)
+
+ # Training loop with progress feedback
+ def train(model, data, epochs=20):
+     model.train()
+     total_batches = len(data) // batch_size + (1 if len(data) % batch_size else 0)
+     print(f"Total batches per epoch: {total_batches}")
+     for epoch in range(epochs):
+         total_loss = 0
+         for batch_idx, (src_batch, tgt_batch) in enumerate(get_batches(data, batch_size, max_len=52), 1):
+             if batch_idx % 100 == 0:  # Printing every 100 batches for feedback
+                 print(f"Epoch {epoch + 1}, Batch {batch_idx}/{total_batches}")
+             optimizer.zero_grad()
+             # Teacher forcing: the decoder sees the target shifted right; the loss
+             # compares predictions against the target shifted left.
+             output = model(src_batch, tgt_batch[:, :-1])
+             loss = criterion(output.view(-1, vocab_size), tgt_batch[:, 1:].reshape(-1))
+             loss.backward()
+             optimizer.step()
+             total_loss += loss.item()
+         avg_loss = total_loss / total_batches
+         print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")
+
+ # Greedy-decoding inference
+ def translate(model, sentence, max_len=52):
+     model.eval()
+     with torch.no_grad():
+         src = text_to_tensor(sentence, word2idx, tokenize, max_len).unsqueeze(0).to(device)
+         tgt = torch.tensor([word2idx['<sos>']], dtype=torch.long).unsqueeze(0).to(device)
+         for _ in range(max_len):
+             output = model(src, tgt)
+             next_token = output[:, -1, :].argmax(dim=-1).item()
+             if next_token == word2idx['<eos>']:
+                 break
+             tgt = torch.cat([tgt, torch.tensor([[next_token]], dtype=torch.long).to(device)], dim=1)
+         translated = [idx2word[idx.item()] for idx in tgt[0] if idx.item() in idx2word]
+         return ' '.join(translated[1:])  # Drop the leading <sos>
+
+
+ # Testing (sanity check on the freshly initialized, untrained model)
+ test_sentence = "عمرك رايح المكسيك؟"  # "Have you ever been to Mexico?"
+ translated = translate(model, test_sentence)
+ print(f"Input: {test_sentence}")
+ print(f"Translated: {translated}")
+
+ print("Chunk 3 completed: Training and inference implemented.")
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Re-instantiate the model with its default hyperparameters (vocab_size comes from the vocabulary built above)
+ model = Transformer(
+     src_vocab_size=vocab_size,
+     tgt_vocab_size=vocab_size
+ ).to(device)
+
+ # Load the model checkpoint and set it to evaluation mode
+ model.load_state_dict(torch.load("habibi.pth", map_location=device))
+ model.eval()
+
+ def gradio_translate(text):
+     return translate(model, text)
+
+ interface = gr.Interface(
+     fn=gradio_translate,
+     inputs=gr.Textbox(lines=2, placeholder="Enter Arabic sentence here..."),
+     outputs="text",
+     title="Arabic to English Translator",
+     description="Translate Arabic sentences to English using a Transformer model."
+ )
+
+ interface.launch()
+
+ print("Chunk 4 completed: Gradio interface deployed.")
habibi.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4b5462a685ebcc69e93a2c568a049190f3cb6d52d13da51e59fbf5098bcb9e6
+ size 69375926
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ torch
+ torchvision
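
For reference, a minimal client-side sketch for querying the Gradio interface defined in app.py once it is running. This is an illustrative assumption rather than part of the commit: it presumes the app is served at the default local URL, that gr.Interface exposes its function under the default "/predict" endpoint, and that the gradio_client package is installed.

    from gradio_client import Client

    # Connect to the locally running Gradio app (default address assumed)
    client = Client("http://127.0.0.1:7860/")

    # Call the translation endpoint with an Arabic sentence
    result = client.predict("عمرك رايح المكسيك؟", api_name="/predict")
    print(result)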