app update
app.py CHANGED
@@ -1,6 +1,10 @@
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
+import numpy as np
+import random
+import re
+import gradio as gr
 
 # hyperparameters
 batch_size = 16 # how many independent sequences will we process in parallel?
@@ -19,48 +23,48 @@ dropout = 0.0
 torch.manual_seed(1337)
 
 
-with open('input.txt', 'r', encoding='utf-8') as f:
-    text = f.read()
-
-# here are all the unique characters that occur in this text
-chars = sorted(list(set(text)))
-vocab_size = len(chars)
-# create a mapping from characters to integers
-stoi = { ch:i for i,ch in enumerate(chars) }
-itos = { i:ch for i,ch in enumerate(chars) }
-encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
-decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
-
-# Train and test splits
-data = torch.tensor(encode(text), dtype=torch.long)
-n = int(0.9*len(data)) # first 90% will be train, rest val
-train_data = data[:n]
-val_data = data[n:]
-
-
-# data loading
-def get_batch(split):
-    # generate a small batch of data of inputs x and targets y
-    data = train_data if split == 'train' else val_data
-    ix = torch.randint(len(data) - block_size, (batch_size,))
-    x = torch.stack([data[i:i+block_size] for i in ix])
-    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
-    x, y = x.to(device), y.to(device)
-    return x, y
-
-@torch.no_grad()
-def estimate_loss():
-    out = {}
-    model.eval()
-    for split in ['train', 'val']:
-        losses = torch.zeros(eval_iters)
-        for k in range(eval_iters):
-            X, Y = get_batch(split)
-            logits, loss = model(X, Y)
-            losses[k] = loss.item()
-        out[split] = losses.mean()
-    model.train()
-    return out
+# with open('input.txt', 'r', encoding='utf-8') as f:
+#     text = f.read()
+
+# # here are all the unique characters that occur in this text
+# chars = sorted(list(set(text)))
+# vocab_size = len(chars)
+# # create a mapping from characters to integers
+# stoi = { ch:i for i,ch in enumerate(chars) }
+# itos = { i:ch for i,ch in enumerate(chars) }
+# encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
+# decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
+
+# # Train and test splits
+# data = torch.tensor(encode(text), dtype=torch.long)
+# n = int(0.9*len(data)) # first 90% will be train, rest val
+# train_data = data[:n]
+# val_data = data[n:]
+
+
+# # data loading
+# def get_batch(split):
+#     # generate a small batch of data of inputs x and targets y
+#     data = train_data if split == 'train' else val_data
+#     ix = torch.randint(len(data) - block_size, (batch_size,))
+#     x = torch.stack([data[i:i+block_size] for i in ix])
+#     y = torch.stack([data[i+1:i+block_size+1] for i in ix])
+#     x, y = x.to(device), y.to(device)
+#     return x, y
+
+# @torch.no_grad()
+# def estimate_loss():
+#     out = {}
+#     model.eval()
+#     for split in ['train', 'val']:
+#         losses = torch.zeros(eval_iters)
+#         for k in range(eval_iters):
+#             X, Y = get_batch(split)
+#             logits, loss = model(X, Y)
+#             losses[k] = loss.item()
+#         out[split] = losses.mean()
+#     model.train()
+#     return out
 
 class Head(nn.Module):
     """ one head of self-attention """
@@ -134,18 +138,24 @@ class Block(nn.Module):
         x = x + self.ffwd(self.ln2(x))
         return x
 
+# super simple bigram model
 # super simple bigram model
 class BigramLanguageModel(nn.Module):
-
-    def __init__(self):
+    def __init__(self, dataset_text, n_embd):
         super().__init__()
-        # each token directly reads off the logits for the next token from a lookup table
-        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
+
+        # Compute character-related parameters
+        self.chars = sorted(list(set(dataset_text)))
+        self.vocab_size = len(self.chars)
+        self.stoi = {ch: i for i, ch in enumerate(self.chars)}
+        self.itos = {i: ch for ch, i in self.stoi.items()}
+
+        self.token_embedding_table = nn.Embedding(self.vocab_size, n_embd)
         self.position_embedding_table = nn.Embedding(block_size, n_embd)
         self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
-        self.ln_f = nn.LayerNorm(n_embd)
-        self.lm_head = nn.Linear(n_embd, vocab_size)
-
+        self.ln_f = nn.LayerNorm(n_embd)
+        self.lm_head = nn.Linear(n_embd, self.vocab_size)
+
     def forward(self, idx, targets=None):
         B, T = idx.shape
 
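The substantive change in this hunk: the vocabulary is now derived inside the constructor from whatever text is passed in, so each checkpoint can carry its own character set. A minimal sketch of the new behavior, not part of the commit, assuming the file's Block, block_size, n_head, n_layer, and n_embd globals are in scope:

    # Hypothetical illustration: vocab_size, and therefore the embedding and
    # lm_head shapes, now depend on the text handed to the constructor.
    toy = BigramLanguageModel("abcab", n_embd)
    print(toy.vocab_size)                           # 3 distinct chars: 'a', 'b', 'c'
    print([toy.stoi[c] for c in "cab"])             # [2, 0, 1]
    print(''.join(toy.itos[i] for i in [2, 0, 1]))  # 'cab'
    # Caveat: the load_state_dict() calls later in the file only succeed if the
    # vocab_size derived here matches the checkpoint's embedding shapes.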
@@ -184,14 +194,42 @@ class BigramLanguageModel(nn.Module):
         idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
         return idx
 
+# Reading shakespeare data
+with open('input.txt', 'r', encoding='utf-8') as f:
+    shakespeare_text = f.read()
+
+
+# Reading wikipedia data
+DATA_PATH = 'wikisent2.txt'
+# load wikipedia sentences
+with open(DATA_PATH, 'r') as f:
+    lines = f.read().splitlines()
+
+# Selecting 250k lines from the dataset.
+random.seed(42)
+texts = random.choices(lines, k=250000)
+del lines
+
+def preprocess(text):
+    text = re.sub('@.*?\s+', '', text) # Remove mentions
+    text = re.sub('#.*?\s+', '', text) # Remove hashtags
+    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text) # Remove URLs
+    text = re.sub(r'[^\w\s\'.]', '', text) # Remove special characters except for single quotes and periods
+    text = re.sub('\s+', ' ', text) # Replace multiple spaces with a single space
+    text = re.sub('^\d+\s*|^\d+\.\d+\s*|^\d+\.\d+\.\d+\s*', '', text) # Remove digits at the start of sentences
+    text = text.strip() # Remove leading and trailing whitespace
+    return text
+
+wiki_text = [preprocess(t) for t in texts]
+wiki_text = '\n'.join(wiki_text)
 
 # Load the shakespeaere model
-shakespeare_model = BigramLanguageModel().to(device) # Initialize an instance of your model
+shakespeare_model = BigramLanguageModel(shakespeare_text, n_embd).to(device) # Initialize an instance of your model
 shakespeare_model.load_state_dict(torch.load('shakespeaere_language_model.pth', map_location=torch.device('cpu')))
 shakespeare_model.eval() # Set the model to evaluation mode
 
 # Load the wikipedia model
-wikipedia_model = BigramLanguageModel().to(device) # Initialize an instance of your model
+wikipedia_model = BigramLanguageModel(wiki_text, n_embd).to(device) # Initialize an instance of your model
 wikipedia_model.load_state_dict(torch.load('wikipedia_language_model.pth', map_location=torch.device('cpu')))
 wikipedia_model.eval() # Set the model to evaluation mode
 
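To make the regex pipeline in preprocess concrete, a hand-traced example (worked out from the patterns as written; the non-raw '\s'/'\d' patterns also draw a SyntaxWarning on recent Pythons, so raw strings would be safer):

    s = "@user said: Hello,   world! See https://example.com now"
    # '@user ' is stripped as a mention, the URL match runs to the end of the
    # line, punctuation other than quotes and periods is dropped, and runs of
    # whitespace collapse to single spaces:
    print(preprocess(s))  # -> 'said Hello world See'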
@@ -213,7 +251,6 @@ def generate_wikipedia_outputs(prompt=None, max_new_tokens=2000):
     text_output = decode(wikipedia_model.generate(context, max_new_tokens=max_new_tokens)[0].tolist())
     return text_output
 
-import gradio as gr
 
 title = "Nano GPT"
 description = "Nano GPT trained on Shakespeare and Wikipedia datasets. It is trained on a very small amount of data to understand how GPT's are trained and built. <a href='https://github.com/karpathy/nanoGPT'>The implementation can be found here </a>"
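One thing worth flagging in this last hunk: the unchanged context line still calls the module-level decode, but the commit comments that helper out, so generation raises a NameError unless decode is redefined somewhere in the unshown part of the file. A minimal repair using the tables the model now stores (decode_with is a hypothetical name, not in the commit):

    def decode_with(model, ids):
        # invert the model's own character table instead of the removed global
        return ''.join(model.itos[i] for i in ids)

    # e.g. inside generate_wikipedia_outputs:
    # text_output = decode_with(wikipedia_model,
    #                           wikipedia_model.generate(context, max_new_tokens=max_new_tokens)[0].tolist())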
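The diff ends before the Gradio interface itself is wired up. Given the title and description strings plus the generate_wikipedia_outputs signature above, the missing tail of app.py presumably resembles this sketch (the component choices are an assumption, not shown in the commit):

    demo = gr.Interface(
        fn=generate_wikipedia_outputs,  # defined earlier in app.py
        inputs=[gr.Textbox(label="Prompt"),
                gr.Slider(minimum=100, maximum=2000, value=500, step=1,
                          label="Max new tokens")],
        outputs=gr.Textbox(label="Generated text"),
        title=title,
        description=description,
    )
    demo.launch()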