abhaskumarsinha committed
Commit d891407
Parent(s): f36f46c
Upload 2 files
Browse files:
- GPT.py +226 -0
- MinimalGPT_2.py +401 -0

GPT.py
ADDED
@@ -0,0 +1,226 @@
import numpy as np
import pandas as pd
import tensorflow as tf
import math
from tqdm import tqdm


def scaled_dot_product_attention(q, k, v):
    # calculate the dot product of query and key
    dot_product = tf.matmul(q, k, transpose_b=True)

    # scale the dot product by the square root of the key dimension
    scaled_dot_product = dot_product / tf.math.sqrt(tf.cast(tf.shape(k)[-1], dtype=tf.float32))

    # apply softmax activation to obtain attention weights
    attention_weights = tf.nn.softmax(scaled_dot_product, axis=-1)

    # compute the weighted sum of the value vectors with the attention weights
    output = tf.matmul(attention_weights, v)

    return output

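
# A minimal shape sanity check for scaled_dot_product_attention (illustrative
# only, not part of the original file): with q, k, v of shape
# (batch, heads, seq, depth), the output keeps the same shape.
def _demo_attention_shapes():
    q = tf.random.normal((2, 4, 8, 16))
    k = tf.random.normal((2, 4, 8, 16))
    v = tf.random.normal((2, 4, 8, 16))
    out = scaled_dot_product_attention(q, k, v)
    print(out.shape)  # (2, 4, 8, 16)
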

class LinearLayer(tf.keras.layers.Layer):
    def __init__(self, ix, ox):
        super().__init__()
        self.ix = ix
        self.ox = ox

    def build(self, input_shapes):
        self.w1 = self.add_weight(shape=(self.ix, self.ox))
        self.b1 = self.add_weight(shape=(1, self.ox))

    def call(self, inputs):
        bz, key = tf.shape(inputs)[0], tf.shape(inputs)[1]
        inputs = tf.reshape(inputs, (-1, self.ix))
        inputs = tf.matmul(inputs, self.w1) + self.b1
        inputs = tf.reshape(inputs, (bz, key, self.ox))
        return inputs


class split_heads(tf.keras.layers.Layer):
    def __init__(self, num_heads=10):
        super().__init__()
        self.num_heads = num_heads

    def call(self, inputs):
        bz, key = tf.shape(inputs)[0], tf.shape(inputs)[1]

        inputs = tf.reshape(inputs, (bz, key, self.num_heads, -1))
        inputs = tf.transpose(inputs, (0, 2, 1, 3))

        return inputs


class merge_heads(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()

    def call(self, inputs):
        bz, key = tf.shape(inputs)[0], tf.shape(inputs)[2]

        inputs = tf.transpose(inputs, (0, 2, 1, 3))
        inputs = tf.reshape(inputs, (bz, key, -1))
        return inputs

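
# Illustrative round-trip check (not in the original file): merge_heads is the
# inverse of split_heads, so merging a split tensor restores the original
# (batch, seq, embedding) shape. Assumes embedding is divisible by num_heads.
def _demo_split_merge_roundtrip():
    x = tf.random.normal((2, 8, 32))      # (batch, seq, embedding)
    heads = split_heads(num_heads=4)(x)   # (2, 4, 8, 8)
    merged = merge_heads()(heads)         # back to (2, 8, 32)
    print(heads.shape, merged.shape)
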

class GPT_Attention(tf.keras.layers.Layer):

    def __init__(self, ix, ox, num_heads):
        super().__init__()
        self.ix = ix
        self.ox = ox
        self.num_heads = num_heads
        self.linear1 = LinearLayer(self.ix, self.ox * 3)
        self.split = split_heads(num_heads=self.num_heads)
        self.merge = merge_heads()
        self.linear2 = LinearLayer(self.ox, self.ix)

        if self.ox % self.num_heads != 0:
            raise ValueError('The value ox = ' + str(self.ox) + ' SHOULD be divisible by the number of heads provided')

    def call(self, inputs):
        # unwrap the (query, key, value) tuple; all three are the same tensor here
        if len(inputs) > 0:
            inputs = inputs[0]
        inputs = self.linear1(inputs)
        q, k, v = tf.split(inputs, 3, axis=-1)
        q = self.split(q)
        k = self.split(k)
        v = self.split(v)
        # pass the projections in (q, k, v) order to match the function signature
        inputs = scaled_dot_product_attention(q, k, v)
        inputs = self.merge(inputs)
        inputs = self.linear2(inputs)

        return inputs

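
# A hedged usage sketch (not in the original file): GPT_Attention expects a
# (query, key, value) tuple and returns a tensor with the input's embedding
# size. Note that no causal mask is applied inside this layer.
def _demo_gpt_attention():
    x = tf.random.normal((2, 8, 64))                 # (batch, seq, embedding)
    attn = GPT_Attention(ix=64, ox=64, num_heads=4)  # ox must divide by num_heads
    out = attn((x, x, x))
    print(out.shape)  # (2, 8, 64)
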

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, num_heads=8, key_dim=64, key_embedding=512):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.key_embedding = key_embedding
        self.head_vectors = []

    def build(self, input_shape):
        self.W_k = self.add_weight(shape=(self.num_heads, self.key_dim, self.key_embedding), name='key')
        self.W_q = self.add_weight(shape=(self.num_heads, self.key_dim, self.key_embedding), name='query')
        self.W_v = self.add_weight(shape=(self.num_heads, self.key_dim, self.key_embedding), name='value')

        self.W_o = self.add_weight(shape=(self.key_dim, self.key_embedding))

    def call(self, inputs):
        query, key, value = inputs

        self.head_vectors = []

        # weight each head's query/key/value elementwise, then attend
        for i in range(self.num_heads):
            q = tf.einsum('bij, ij -> bij', query, self.W_q[i])
            k = tf.einsum('bij, ij -> bij', key, self.W_k[i])
            v = tf.einsum('bij, ij -> bij', value, self.W_v[i])

            self.head_vectors += [scaled_dot_product_attention(q, k, v)]

        # concatenate the heads along the sequence axis, then project back
        head_concat = tf.concat(self.head_vectors, -2)
        output = tf.einsum('bij, kj -> bkj', head_concat, self.W_o)

        return output

class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_heads=8, key_dim=64, key_embedding=512, GPT_attention=False):
        super(Decoder, self).__init__()

        self.num_heads = num_heads
        self.key_dim = key_dim
        self.key_embedding = key_embedding
        if GPT_attention:
            self.attention = GPT_Attention(key_embedding, key_embedding, num_heads)
        else:
            self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim, key_embedding=key_embedding)
        self.normalize1 = tf.keras.layers.LayerNormalization(axis=-2)
        self.normalize2 = tf.keras.layers.LayerNormalization(axis=-2)

    def build(self, input_shape):
        # elementwise weights and biases of the two-layer position-wise network
        self.x1 = self.add_weight(shape=(self.key_dim, self.key_embedding), name='vec1')
        self.x2 = self.add_weight(shape=(self.key_dim, self.key_embedding), name='vec2')

        self.y1 = self.add_weight(shape=(self.key_dim, self.key_embedding), name='bias1')
        self.y2 = self.add_weight(shape=(self.key_dim, self.key_embedding), name='bias2')

    def call(self, inputs):
        # self-attention sublayer with residual connection and normalization
        first_sublayer_output = self.attention((inputs, inputs, inputs))
        first_sublayer_output = self.normalize1(first_sublayer_output + inputs)

        # feed-forward sublayer: elementwise affine -> ReLU -> elementwise affine
        first_nn = tf.einsum('bij, ij -> bij', first_sublayer_output, self.x1) + self.y1
        first_nn = tf.keras.activations.relu(first_nn, alpha=0.0, max_value=None, threshold=0.0)
        second_nn = tf.einsum('bij, ij -> bij', first_nn, self.x2) + self.y2

        second_sublayer_output = self.normalize2(second_nn + first_sublayer_output)

        return second_sublayer_output

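
# Illustrative sketch (not in the original file): a single Decoder block maps a
# (batch, seq, embedding) tensor to the same shape, so blocks can be stacked.
# key_dim must equal the sequence length, key_embedding the embedding size.
def _demo_decoder_block():
    x = tf.random.normal((2, 8, 64))  # (batch, seq=key_dim, embedding)
    block = Decoder(num_heads=4, key_dim=8, key_embedding=64, GPT_attention=True)
    print(block(x).shape)  # (2, 8, 64)
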
def positional_function(words, embedding):
    pos = np.zeros((words, embedding))

    # sinusoidal positional encoding; frequency scaled by the embedding width
    for i in range(words):
        for j in range(embedding):
            if j % 2 == 0:
                pos[i, j] = math.sin(i / pow(10000, 2 * j / embedding))
            else:
                pos[i, j] = math.cos(i / pow(10000, 2 * j / embedding))

    return pos


class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, positional_function=positional_function, embedding_size=512, words=64):
        super(PositionalEmbedding, self).__init__()
        self.embedding_size = embedding_size
        self.words = words
        self.pos_mat = tf.cast(tf.convert_to_tensor(positional_function(self.words, self.embedding_size)), tf.float32)

    def build(self, input_sizes):
        pass

    def call(self, inputs):
        # note: the positional matrix is multiplied elementwise into the
        # embeddings here, rather than added as in the original Transformer
        embed = tf.einsum("bij, ij -> bij", inputs, self.pos_mat)
        return embed

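
# Quick illustrative check (not in the original file): the positional matrix
# has one row per position, and the layer preserves the input shape.
def _demo_positional_embedding():
    pe = PositionalEmbedding(embedding_size=64, words=8)
    x = tf.ones((1, 8, 64))
    print(pe(x).shape)       # (1, 8, 64)
    print(pe.pos_mat.shape)  # (8, 64)
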
def generate_output(model, vectorizer, text_size=70, gpt_input=64, input_sequence=[]):

    # seed the window with zeros when no prompt sequence is provided
    if input_sequence == []:
        input_sequence = tf.zeros((1, gpt_input)).numpy()

    text = tf.zeros((1, text_size)).numpy()
    text[0][:gpt_input] = input_sequence[0][:gpt_input]

    GPT = model

    # sliding-window generation: predict one token, then shift the
    # gpt_input-sized window one step to the right and repeat
    for i in tqdm(range(gpt_input, text_size)):
        output = tf.argmax(GPT(input_sequence), -1).numpy()
        text[0][i - 1] = output
        input_sequence = text[0][i - gpt_input : i].reshape(1, gpt_input)

    # map token ids back to vocabulary strings
    op = [vectorizer.get_vocabulary()[int(text[0][i])] for i in range(len(text[0]))]
    return ' '.join(op)
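
# End-to-end usage sketch (not in the original file): builds a throwaway
# vectorizer and a random, untrained stand-in "model" with the right
# input/output shapes, just to show how generate_output is driven. A real
# model comes from MinimalGPT_2.py.
def _demo_generate_output():
    vec = tf.keras.layers.TextVectorization(standardize=None, split='whitespace',
                                            vocabulary=['hello', 'world', 'foo', 'bar'])
    vocab_size = vec.vocabulary_size()  # includes '' and '[UNK]'
    gpt_input = 4
    inp = tf.keras.layers.Input((gpt_input,))
    logits = tf.keras.layers.Dense(vocab_size)(tf.keras.layers.Flatten()(inp))
    model = tf.keras.Model(inp, tf.nn.softmax(logits))
    print(generate_output(model, vec, text_size=10, gpt_input=gpt_input))
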
MinimalGPT_2.py
ADDED
@@ -0,0 +1,401 @@
import os
import json
import tensorflow as tf
from tqdm import tqdm
from GPT import *
import pickle
import argparse
import sys


def save_module(save_weights, model, vectorizer, save_tokenizer):

    # Save the GPT model weights
    with open(save_weights, 'wb') as file:
        pickle.dump(model.weights, file)

    # Save the vectorizer's vocabulary, encoded as JSON-compatible strings;
    # the first two entries ('' and '[UNK]') are re-created on load, so skip them
    vocabulary = vectorizer.get_vocabulary()
    encoded_vocabulary = [word.encode('unicode_escape').decode('utf-8') for word in vocabulary]
    encoded_vocabulary = encoded_vocabulary[2:]

    # Save the encoded vocabulary to a JSON file
    with open(save_tokenizer, 'w') as f:
        json.dump(encoded_vocabulary, f)
    print("Vocabulary size saved: " + str(len(encoded_vocabulary)))

def read_file(f, vectorizer, chunk_size=1024, starting_chunk=0, ending_chunk=5, gpt_input=10):
    i = 0

    while True:
        data = f.read(chunk_size)

        if not data or i > ending_chunk:
            break

        if starting_chunk <= i <= ending_chunk:
            # slide a gpt_input-sized window over the chunk: the window is the
            # input, the word immediately after it is the target
            file_contents = data.split()
            input_tokens, output_tokens = [], []
            for j in range(len(file_contents) - gpt_input - 1):
                input_tokens += [file_contents[j : j + gpt_input]]
                output_tokens += [file_contents[j + gpt_input]]

            X = [' '.join(input_tokens[j]) for j in range(len(input_tokens))]
            Y = output_tokens

            X = vectorizer(X)
            Y = vectorizer(Y)

            # inputs and targets side by side: columns [:gpt_input] and [-1]
            output = tf.concat([X, Y], 1)

            yield output

        i += 1

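
# Illustrative sketch (not in the original file): read_file slides a
# gpt_input-sized window over each chunk of a file-like object and yields a
# tensor whose first gpt_input columns are the context and whose last column
# is the next-token target. (A chunk boundary may still cut a word in half;
# that is inherent to the byte-based chunking above.)
def _demo_read_file():
    import io
    words = ' '.join(str(n) for n in range(40))
    vec = tf.keras.layers.TextVectorization(standardize=None, split='whitespace',
                                            vocabulary=[str(n) for n in range(40)])
    for batch in read_file(io.StringIO(words), vec, chunk_size=64,
                           starting_chunk=0, ending_chunk=1, gpt_input=4):
        print(batch.shape)  # (windows_in_chunk, 4 + 1)
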
def get_model(gpt_input, d_model, h, vocab_size, decoder_stacks, GPT_attention):
    input_words = tf.keras.layers.Input((gpt_input))
    embedding = tf.keras.layers.Embedding(vocab_size + 2, d_model)(input_words)
    positional_enc = PositionalEmbedding(words=gpt_input, embedding_size=d_model)(embedding)
    decoder = Decoder(num_heads=h, key_dim=gpt_input, key_embedding=d_model, GPT_attention=GPT_attention)(positional_enc)

    for _ in range(decoder_stacks - 1):
        decoder = Decoder(num_heads=h, key_dim=gpt_input, key_embedding=d_model, GPT_attention=GPT_attention)(decoder)

    decoder = tf.keras.layers.Flatten()(decoder)
    linear_layer = tf.keras.layers.Dense(vocab_size + 3)(decoder)
    softmax = tf.nn.softmax(linear_layer)
    GPT = tf.keras.Model(inputs=input_words, outputs=softmax)

    return GPT

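
# Illustrative sketch (not in the original file): build a tiny model to see
# the wiring; the output is a (batch, vocab_size + 3) next-token distribution.
def _demo_get_model():
    tiny = get_model(gpt_input=8, d_model=32, h=4, vocab_size=100,
                     decoder_stacks=2, GPT_attention=True)
    print(tiny.output_shape)  # (None, 103)
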

def MinimalGPT(data_path='.',
               learning_rate=0,
               output_length=0,
               epochs=1,
               batch_size=1,
               gpt_input=10,
               d_model=128,
               h=8,
               decoder_stacks=1,
               starting_chunk=0,
               ending_chunk=5,
               chunk_size=10,
               token_end=40000,
               vocabulary_start=0,
               vocabulary_end=40000,
               save=False,
               load_tokenizer=None,
               load_weights=None,
               save_tokenizer=None,
               save_weights=None,
               optimizer=None,
               inference_only=False,
               return_model_and_vectorizer=False,
               return_model_and_vectorizer_and_output=False,
               GPT_attention=False,
               TPU=False):

    # chunk_size is given in KB; convert it to bytes for file.read()
    if chunk_size:
        chunk_size *= 1024

    if inference_only == False:
        with open(data_path, 'r', encoding='utf-8') as file:
            corpus = file.read()

    # Either restore a saved vocabulary, or build a fresh one from the corpus
    if load_tokenizer:
        with open(load_tokenizer, 'r') as f:
            encoded_vocabulary = json.load(f)

        # Decode the encoded vocabulary back to the original strings
        vocabulary = [word.encode('utf-8').decode('unicode_escape') for word in encoded_vocabulary]
        vectorizer = tf.keras.layers.TextVectorization(standardize=None, split='whitespace')
        vectorizer.set_vocabulary(vocabulary)
        vocab_size = vectorizer.vocabulary_size()

    else:
        vocab = []
        for word in tqdm(corpus.split()[vocabulary_start : vocabulary_end]):
            vocab += [word]
        vocab = list(set(vocab))
        vocab_size = len(vocab)
        vectorizer = tf.keras.layers.TextVectorization(standardize=None, split='whitespace', vocabulary=vocab)
        print('New Vectorizer created successfully...')
        print("Vocabulary Size: " + str(vocab_size))
        del corpus

    # A saved vocabulary excludes the two special tokens ('' and '[UNK]'),
    # so a reloaded model is rebuilt with vocab_size - 2
    if load_weights:
        model = get_model(gpt_input=gpt_input, d_model=d_model, h=h, decoder_stacks=decoder_stacks, vocab_size=vocab_size - 2, GPT_attention=GPT_attention)

        with open(load_weights, 'rb') as file:
            W = pickle.load(file)
        model.set_weights(W)
    else:
        model = get_model(gpt_input=gpt_input, d_model=d_model, h=h, decoder_stacks=decoder_stacks, vocab_size=vocab_size, GPT_attention=GPT_attention)

    print(model.summary())

    if inference_only == False:
        # Compile the model
        if not optimizer:
            model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy')
        else:
            model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')

        # Train the model chunk by chunk, ONLY if the learning rate is > 0
        if learning_rate > 0:

            for epoch in tqdm(range(epochs)):

                with open(data_path, 'r', encoding='utf-8') as f:
                    chunk_number = 1
                    for chunk in read_file(f,
                                           vectorizer,
                                           chunk_size,
                                           starting_chunk,
                                           ending_chunk,
                                           gpt_input):
                        print('Chunk size: ' + str(chunk.shape[0]))
                        model.fit(chunk[:, :gpt_input], tf.reshape(chunk[:, -1], (-1, 1)), batch_size=batch_size, epochs=1)
                        print("Chunk Number " + str(chunk_number) + "/" + str(ending_chunk - starting_chunk + 1) + " processed!")
                        chunk_number += 1

    # Generate sample output from the (possibly freshly trained) model
    output_seq = generate_output(gpt_input=gpt_input, model=model, vectorizer=vectorizer, text_size=output_length, input_sequence=[])

    if save == True and TPU == False:
        print('Saving the model and the tokenizer...')
        save_module(save_weights, model, vectorizer, save_tokenizer)

    # Under a TPU strategy the caller must save outside the strategy scope,
    # so return everything save_module needs instead of saving here
    if save == True and TPU == True:
        return save_weights, model, vectorizer, save_tokenizer, output_seq

    if return_model_and_vectorizer:
        return model, vectorizer
    elif return_model_and_vectorizer_and_output:
        return model, vectorizer, output_seq.replace('@@ ', '')
    else:
        return output_seq.replace('@@ ', '')

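
# Hedged usage sketch (not in the original file): a programmatic call that
# trains on a small text file for one epoch and returns generated text. All
# paths and hyperparameters are placeholders.
#
#   text = MinimalGPT(data_path='corpus.txt', learning_rate=1e-4,
#                     output_length=50, epochs=1, batch_size=32,
#                     gpt_input=10, d_model=128, h=8, decoder_stacks=1,
#                     starting_chunk=0, ending_chunk=5, chunk_size=10,
#                     save=True, save_weights='model.pickle',
#                     save_tokenizer='tokenizer.json')
#   print(text)
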
# Example code to execute when the script file is called

def main():
    print("This code is executed when the script file is called directly.")

# Check if the script is being run as the main module
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data-path', help='File: Corresponding to corpus or training text [String]')
    parser.add_argument('-l', '--learning-rate', help='Float: Learning Rate. The model will train ONLY IF the rate is > 0, skip otherwise [Float]', type=float)
    parser.add_argument('-ol', '--output-length', help='Length of the output sequence to be generated [Int]', type=int)
    parser.add_argument('-e', '--epochs', help='Number of training Epochs [Int]', type=int)
    parser.add_argument('-b', '--batch-size', help='Size of each batch [Int]', type=int)
    parser.add_argument('-s', '--gpt-input', help='Number of Tokens of text the model inputs at a time [Int]', type=int)
    parser.add_argument('-dm', '--d-model', help='Embedding layer output dimensions [Int]', type=int)
    parser.add_argument('-p', '--multi-head', help='Number of Multi-head Attention layers in parallel [Int]', type=int)
    parser.add_argument('-ds', '--decoder-stacks', help='Number of stacked Decoder layers [Int]', type=int)
    parser.add_argument('-sc', '--chunk-start', help='The chunk number in the corpus that marks the starting point of the training [Int]', type=int)
    parser.add_argument('-ec', '--chunk-end', help='The chunk number in the corpus that marks the end point of the training [Int]', type=int)
    parser.add_argument('-csz', '--chunk-size', help='The size of each chunk in KB [Int]', type=int)
    parser.add_argument('-vs', '--vocabulary-start', help='Token number from the corpus that marks the starting point of the vocabulary data [Int]', type=int)
    parser.add_argument('-ve', '--vocabulary-end', help='Token number from the corpus that marks the end point of the vocabulary data [Int]', type=int)
    parser.add_argument('-sd', '--save', help='Save the Model and Vectorizer data to disk [True/False]', action='store_true')
    parser.add_argument('-lt', '--load-tokenizer', help='File: Vectorization layer [File]')
    parser.add_argument('-lw', '--load-weights', help='File: Model Weights [File]')
    parser.add_argument('-st', '--save-tokenizer', help='File: Saving Vectorizer File [File]')
    parser.add_argument('-sw', '--save-weights', help='File: Saving Model Weights [File]')
    parser.add_argument('-ot', '--optimizer', help='Optimizer consistent with the TensorFlow optimizer class [tf.keras.optimizers]')
    parser.add_argument('-i', '--inference-only', help='Only print the output of the model in Inference Mode [True/False]', action='store_true')
    parser.add_argument('-mv', '--model-vectorizer', help='Return (Model, Vectorizer) tuple [True/False]', action='store_true')
    parser.add_argument('-mvo', '--model-vectorizer-output', help='Return (Model, Vectorizer, Output) tuple [True/False]', action='store_true')
    parser.add_argument('-ga', '--gpt-style-attention', help='Use GPT-styled attention. Note: the (d-model) parameter should be divisible by (multi-head), otherwise the program will throw an error! [True/False]', action='store_true')
    parser.add_argument('-tpu', '--TPU', help='Use Tensor Processing Units (Distributed Learning) [True/False]', action='store_true')

    args = parser.parse_args()

    data_path = args.data_path
    learning_rate = args.learning_rate
    output_length = args.output_length
    epochs = args.epochs
    batch_size = args.batch_size
    gpt_input = args.gpt_input
    d_model = args.d_model
    h = args.multi_head
    stacks = args.decoder_stacks
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end
    chunk_size = args.chunk_size
    vocabulary_start = args.vocabulary_start
    vocabulary_end = args.vocabulary_end
    save = args.save
    load_tokenizer = args.load_tokenizer
    load_weights = args.load_weights
    save_tokenizer = args.save_tokenizer
    save_weights = args.save_weights
    optimizer = args.optimizer
    inference_only = args.inference_only
    model_and_vectorizer = args.model_vectorizer
    GPT_attention = args.gpt_style_attention
    model_vectorizer_output = args.model_vectorizer_output

    configuration = {
        'data_path': args.data_path,
        'learning_rate': args.learning_rate,
        'output_length': args.output_length,
        'epochs': args.epochs,
        'batch_size': args.batch_size,
        'gpt_input': args.gpt_input,
        'd_model': args.d_model,
        'h': args.multi_head,
        'stacks': args.decoder_stacks,
        'chunk_start': args.chunk_start,
        'chunk_end': args.chunk_end,
        'chunk_size': args.chunk_size,
        'vocabulary_start': args.vocabulary_start,
        'vocabulary_end': args.vocabulary_end,
        'save': args.save,
        'load_tokenizer': args.load_tokenizer,
        'load_weights': args.load_weights,
        'save_tokenizer': args.save_tokenizer,
        'save_weights': args.save_weights,
        'optimizer': args.optimizer,
        'inference_only': args.inference_only,
        'model_and_vectorizer': args.model_vectorizer,
        'model_vectorizer_output': args.model_vectorizer_output,
        'GPT_Attention': args.gpt_style_attention
    }

    # Save the configuration to a JSON file
    with open('last-configuration.json', 'w') as file:
        json.dump(configuration, file)

    if args.TPU == True:

        # This is the TPU initialization code that has to be at the beginning
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        print("All devices: ", tf.config.list_logical_devices('TPU'))

        strategy = tf.distribute.TPUStrategy(resolver)

        with strategy.scope():

            output = MinimalGPT(data_path=data_path,
                                learning_rate=learning_rate,
                                output_length=output_length,
                                epochs=epochs,
                                batch_size=batch_size,
                                gpt_input=gpt_input,
                                d_model=d_model,
                                h=h,
                                decoder_stacks=stacks,
                                starting_chunk=chunk_start,
                                ending_chunk=chunk_end,
                                chunk_size=chunk_size,
                                vocabulary_start=vocabulary_start,
                                vocabulary_end=vocabulary_end,
                                save=save,
                                load_tokenizer=load_tokenizer,
                                load_weights=load_weights,
                                save_tokenizer=save_tokenizer,
                                save_weights=save_weights,
                                optimizer=optimizer,
                                inference_only=inference_only,
                                return_model_and_vectorizer=model_and_vectorizer,
                                return_model_and_vectorizer_and_output=model_vectorizer_output,
                                GPT_attention=GPT_attention,
                                TPU=True)

        # Saving must happen outside the TPU strategy scope; this path assumes
        # --save was passed, so MinimalGPT returned the full save tuple
        save_module(output[0], output[1], output[2], output[3])

        print(output[4])
        sys.exit(0)

    output = MinimalGPT(data_path=data_path,
                        learning_rate=learning_rate,
                        output_length=output_length,
                        epochs=epochs,
                        batch_size=batch_size,
                        gpt_input=gpt_input,
                        d_model=d_model,
                        h=h,
                        decoder_stacks=stacks,
                        starting_chunk=chunk_start,
                        ending_chunk=chunk_end,
                        chunk_size=chunk_size,
                        vocabulary_start=vocabulary_start,
                        vocabulary_end=vocabulary_end,
                        save=save,
                        load_tokenizer=load_tokenizer,
                        load_weights=load_weights,
                        save_tokenizer=save_tokenizer,
                        save_weights=save_weights,
                        optimizer=optimizer,
                        inference_only=inference_only,
                        return_model_and_vectorizer=model_and_vectorizer,
                        return_model_and_vectorizer_and_output=model_vectorizer_output,
                        GPT_attention=GPT_attention,
                        TPU=False)
    print(output)
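
# Example invocation (illustrative; flags as defined above, paths are placeholders):
#
#   python MinimalGPT_2.py -d corpus.txt -l 0.0001 -ol 50 -e 1 -b 32 \
#       -s 10 -dm 128 -p 8 -ds 1 -sc 0 -ec 5 -csz 10 -vs 0 -ve 40000 \
#       -sd -st tokenizer.json -sw model.pickle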