mkthoma committed
Commit 97b7e88 · 1 Parent(s): 0a76e00

app update

Files changed (1)
  1. app.py +89 -52
app.py CHANGED
@@ -1,6 +1,10 @@
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
+ import numpy as np
+ import random
+ import re
+ import gradio as gr
 
 # hyperparameters
 batch_size = 16 # how many independent sequences will we process in parallel?
@@ -19,48 +23,48 @@ dropout = 0.0
 torch.manual_seed(1337)
 
 
- with open('input.txt', 'r', encoding='utf-8') as f:
-     text = f.read()
-
- # here are all the unique characters that occur in this text
- chars = sorted(list(set(text)))
- vocab_size = len(chars)
- # create a mapping from characters to integers
- stoi = { ch:i for i,ch in enumerate(chars) }
- itos = { i:ch for i,ch in enumerate(chars) }
- encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
- decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
-
- # Train and test splits
- data = torch.tensor(encode(text), dtype=torch.long)
- n = int(0.9*len(data)) # first 90% will be train, rest val
- train_data = data[:n]
- val_data = data[n:]
-
-
- # data loading
- def get_batch(split):
-     # generate a small batch of data of inputs x and targets y
-     data = train_data if split == 'train' else val_data
-     ix = torch.randint(len(data) - block_size, (batch_size,))
-     x = torch.stack([data[i:i+block_size] for i in ix])
-     y = torch.stack([data[i+1:i+block_size+1] for i in ix])
-     x, y = x.to(device), y.to(device)
-     return x, y
-
- @torch.no_grad()
- def estimate_loss():
-     out = {}
-     model.eval()
-     for split in ['train', 'val']:
-         losses = torch.zeros(eval_iters)
-         for k in range(eval_iters):
-             X, Y = get_batch(split)
-             logits, loss = model(X, Y)
-             losses[k] = loss.item()
-         out[split] = losses.mean()
-     model.train()
-     return out
+ # with open('input.txt', 'r', encoding='utf-8') as f:
+ #     text = f.read()
+
+ # # here are all the unique characters that occur in this text
+ # chars = sorted(list(set(text)))
+ # vocab_size = len(chars)
+ # # create a mapping from characters to integers
+ # stoi = { ch:i for i,ch in enumerate(chars) }
+ # itos = { i:ch for i,ch in enumerate(chars) }
+ # encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
+ # decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
+
+ # # Train and test splits
+ # data = torch.tensor(encode(text), dtype=torch.long)
+ # n = int(0.9*len(data)) # first 90% will be train, rest val
+ # train_data = data[:n]
+ # val_data = data[n:]
+
+
+ # # data loading
+ # def get_batch(split):
+ #     # generate a small batch of data of inputs x and targets y
+ #     data = train_data if split == 'train' else val_data
+ #     ix = torch.randint(len(data) - block_size, (batch_size,))
+ #     x = torch.stack([data[i:i+block_size] for i in ix])
+ #     y = torch.stack([data[i+1:i+block_size+1] for i in ix])
+ #     x, y = x.to(device), y.to(device)
+ #     return x, y
+
+ # @torch.no_grad()
+ # def estimate_loss():
+ #     out = {}
+ #     model.eval()
+ #     for split in ['train', 'val']:
+ #         losses = torch.zeros(eval_iters)
+ #         for k in range(eval_iters):
+ #             X, Y = get_batch(split)
+ #             logits, loss = model(X, Y)
+ #             losses[k] = loss.item()
+ #         out[split] = losses.mean()
+ #     model.train()
+ #     return out
 
 class Head(nn.Module):
     """ one head of self-attention """
@@ -134,18 +138,24 @@ class Block(nn.Module):
         x = x + self.ffwd(self.ln2(x))
         return x
 
+ # super simple bigram model
 # super simple bigram model
 class BigramLanguageModel(nn.Module):
-
-     def __init__(self):
+     def __init__(self, dataset_text, n_embd):
         super().__init__()
-         # each token directly reads off the logits for the next token from a lookup table
-         self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
+
+         # Compute character-related parameters
+         self.chars = sorted(list(set(dataset_text)))
+         self.vocab_size = len(self.chars)
+         self.stoi = {ch: i for i, ch in enumerate(self.chars)}
+         self.itos = {i: ch for ch, i in self.stoi.items()}
+
+         self.token_embedding_table = nn.Embedding(self.vocab_size, n_embd)
         self.position_embedding_table = nn.Embedding(block_size, n_embd)
         self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
-         self.ln_f = nn.LayerNorm(n_embd) # final layer norm
-         self.lm_head = nn.Linear(n_embd, vocab_size)
-
+         self.ln_f = nn.LayerNorm(n_embd)
+         self.lm_head = nn.Linear(n_embd, self.vocab_size)
+
     def forward(self, idx, targets=None):
         B, T = idx.shape
 
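
The constructor now derives its vocabulary from whatever text it is handed, so each checkpoint must be paired with the same dataset text it was trained on; a different text would change vocab_size and the saved embedding shapes would no longer match the state dict. A small sketch of the bookkeeping the new __init__ performs, on a made-up string:

# illustrative only: vocabulary bookkeeping as in the new __init__
sample_text = "hello world"
chars = sorted(list(set(sample_text)))        # [' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w']
vocab_size = len(chars)                       # 8 distinct characters
stoi = {ch: i for i, ch in enumerate(chars)}  # char -> index
itos = {i: ch for ch, i in stoi.items()}      # index -> char (inverse mapping)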
@@ -184,14 +194,42 @@ class BigramLanguageModel(nn.Module):
             idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
         return idx
 
+ # Reading shakespeare data
+ with open('input.txt', 'r', encoding='utf-8') as f:
+     shakespeare_text = f.read()
+
+
+ # Reading wikipedia data
+ DATA_PATH = 'wikisent2.txt'
+ # load wikipedia sentences
+ with open(DATA_PATH, 'r') as f:
+     lines = f.read().splitlines()
+
+ # Selecting 250k lines from the dataset.
+ random.seed(42)
+ texts = random.choices(lines, k=250000)
+ del lines
+
+ def preprocess(text):
+     text = re.sub(r'@.*?\s+', '', text) # Remove mentions
+     text = re.sub(r'#.*?\s+', '', text) # Remove hashtags
+     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text) # Remove URLs
+     text = re.sub(r'[^\w\s\'.]', '', text) # Remove special characters except for single quotes and periods
+     text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with a single space
+     text = re.sub(r'^\d+\s*|^\d+\.\d+\s*|^\d+\.\d+\.\d+\s*', '', text) # Remove digits at the start of sentences
+     text = text.strip() # Remove leading and trailing whitespace
+     return text
+
+ wiki_text = [preprocess(t) for t in texts]
+ wiki_text = '\n'.join(wiki_text)
 
 # Load the shakespeare model
- shakespeare_model = BigramLanguageModel().to(device) # Initialize an instance of your model
+ shakespeare_model = BigramLanguageModel(shakespeare_text, n_embd).to(device) # Initialize an instance of your model
 shakespeare_model.load_state_dict(torch.load('shakespeaere_language_model.pth', map_location=torch.device('cpu')))
 shakespeare_model.eval() # Set the model to evaluation mode
 
 # Load the wikipedia model
- wikipedia_model = BigramLanguageModel().to(device) # Initialize an instance of your model
+ wikipedia_model = BigramLanguageModel(wiki_text, n_embd).to(device) # Initialize an instance of your model
 wikipedia_model.load_state_dict(torch.load('wikipedia_language_model.pth', map_location=torch.device('cpu')))
 wikipedia_model.eval() # Set the model to evaluation mode
 
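
To make the preprocess additions concrete, here is a hand-traced example on a fabricated line (the exact output is approximate; note also that the URL pattern's greedy .* removes everything after a URL on the same line):

# illustrative trace of preprocess() on a made-up line
preprocess("42  The  @someone   quick   brown fox!!")
# mention '@someone   ' removed, '!!' dropped by the special-character
# filter, whitespace collapsed, leading '42 ' stripped:
# -> 'The quick brown fox'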
@@ -213,7 +251,6 @@ def generate_wikipedia_outputs(prompt=None, max_new_tokens=2000):
     text_output = decode(wikipedia_model.generate(context, max_new_tokens=max_new_tokens)[0].tolist())
     return text_output
 
- import gradio as gr
 
 title = "Nano GPT"
 description = "Nano GPT trained on Shakespeare and Wikipedia datasets. It is trained on a very small amount of data to understand how GPTs are trained and built. <a href='https://github.com/karpathy/nanoGPT'>The implementation can be found here</a>"
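
A minimal usage sketch for the loaded models, assuming the per-model itos mapping added in this commit (the app's generate_wikipedia_outputs helper shown in the hunk above wraps the same generate call):

# illustrative only: sample 100 characters from the shakespeare model,
# starting from a single zero token, decoding via the model's own itos
context = torch.zeros((1, 1), dtype=torch.long, device=device)
ids = shakespeare_model.generate(context, max_new_tokens=100)[0].tolist()
print(''.join(shakespeare_model.itos[i] for i in ids))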
 