abhaskumarsinha committed on
Commit
d891407
1 Parent(s): f36f46c

Upload 2 files

Browse files
Files changed (2) hide show
  1. GPT.py +226 -0
  2. MinimalGPT_2.py +401 -0
GPT.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import tensorflow as tf
4
+ import math
5
+ from tqdm import tqdm
6
+
7
def scaled_dot_product_attention(q, k, v):
    """Return softmax(q @ k^T / sqrt(d)) @ v.

    ``d`` is the size of the last axis of ``k``. No attention mask is applied.

    Args:
        q: Query tensor.
        k: Key tensor; its last axis is contracted against the last axis of q.
        v: Value tensor combined using the attention weights.

    Returns:
        The attention-weighted combination of ``v``.
    """
    # Size of the key's last axis, as a float so it can be square-rooted.
    depth = tf.cast(tf.shape(k)[-1], dtype=tf.float32)

    # Similarity scores, scaled down by sqrt(depth).
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(depth)

    # Normalise the scores over the last axis to obtain attention weights.
    weights = tf.nn.softmax(logits, axis=-1)

    return tf.matmul(weights, v)
22
+
23
+
24
class LinearLayer(tf.keras.layers.Layer):
    """Affine projection applied to the last axis of a 3-D input.

    Args:
        ix: Size of the last input axis.
        ox: Size of the last output axis.
    """

    def __init__(self, ix, ox):
        super().__init__()
        self.ix = ix
        self.ox = ox

    def build(self, input_shapes):
        """Create the weight matrix (ix, ox) and the bias row (1, ox)."""
        self.w1 = self.add_weight(shape=(self.ix, self.ox))
        self.b1 = self.add_weight(shape=(1, self.ox))

    def call(self, inputs):
        """Project ``inputs`` from (batch, steps, ix) to (batch, steps, ox)."""
        dims = tf.shape(inputs)
        batch, steps = dims[0], dims[1]

        # Collapse the two leading axes so a single 2-D matmul does the work.
        flat = tf.reshape(inputs, (-1, self.ix))
        projected = tf.matmul(flat, self.w1) + self.b1

        return tf.reshape(projected, (batch, steps, self.ox))
41
+
42
+
43
+
44
class split_heads(tf.keras.layers.Layer):
    """Split the last axis of a 3-D tensor into ``num_heads`` groups.

    Args:
        num_heads: Number of groups the last axis is divided into.
    """

    def __init__(self, num_heads = 10):
        super().__init__()
        self.num_heads = num_heads

    def call(self, inputs):
        """Map (batch, steps, features) to (batch, num_heads, steps, features / num_heads)."""
        dims = tf.shape(inputs)
        batch, steps = dims[0], dims[1]

        per_head = tf.reshape(inputs, (batch, steps, self.num_heads, -1))
        # Move the head axis in front of the step axis.
        return tf.transpose(per_head, (0, 2, 1, 3))
56
+
57
+
58
class merge_heads(tf.keras.layers.Layer):
    """Inverse of ``split_heads``: fold the head axis back into the last axis."""

    def __init__(self):
        super().__init__()

    def call(self, inputs):
        """Map (batch, heads, steps, depth) to (batch, steps, heads * depth)."""
        dims = tf.shape(inputs)
        batch, steps = dims[0], dims[2]

        # Put the step axis back in front of the head axis before flattening.
        step_major = tf.transpose(inputs, (0, 2, 1, 3))
        return tf.reshape(step_major, (batch, steps, -1))
68
+
69
+
70
+
71
class GPT_Attention(tf.keras.layers.Layer):
    """Multi-head self-attention built from one fused projection.

    A single ``LinearLayer`` produces three equally sized slices, each slice is
    split into heads, attention is applied, and the heads are merged and
    projected back to ``ix`` features.

    Args:
        ix: Feature size of the input (and of the output).
        ox: Feature size of each of the three projected slices.
        num_heads: Number of attention heads; must divide ``ox``.

    Raises:
        ValueError: If ``ox`` is not divisible by ``num_heads``.
    """

    def __init__(self, ix, ox, num_heads):
        super().__init__()
        self.ix = ix
        self.ox = ox
        self.num_heads = num_heads

        # One projection yields all three slices at once, hence ox * 3.
        self.linear1 = LinearLayer(self.ix, self.ox * 3)
        self.split = split_heads(num_heads = self.num_heads)
        self.merge = merge_heads()
        self.linear2 = LinearLayer(self.ox, self.ix)

        if self.ox % self.num_heads != 0:
            raise ValueError('The value ox = '+ str(self.ox) +' SHOULD be divisible by number of heads provided')

    def call(self, inputs):
        """Apply self-attention to the first element of ``inputs``.

        ``Decoder`` calls this layer with a 3-tuple of the same tensor; only the
        first element is used.
        """
        if len(inputs) > 0:
            inputs = inputs[0]

        fused = self.linear1(inputs)
        # Slices are named k, q, v in that order and passed on in that order.
        k, q, v = (self.split(part) for part in tf.split(fused, 3, axis = -1))

        attended = scaled_dot_product_attention(k, q, v)
        return self.linear2(self.merge(attended))
100
+
101
+
102
+
103
class MultiHeadAttention(tf.keras.layers.Layer):
    """Attention layer with one element-wise weight grid per head.

    Each head scales query, key and value element-wise by its own
    (key_dim, key_embedding) weight grid before scaled dot-product attention.

    Args:
        num_heads: Number of heads.
        key_dim: Size of the second-to-last input axis.
        key_embedding: Size of the last input axis.
    """

    def __init__(self, num_heads = 8, key_dim = 64, key_embedding = 512):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.key_embedding = key_embedding
        self.head_vectors = []

    def build(self, input_shape):
        """Create the per-head key/query/value grids and the output grid."""
        per_head_shape = (self.num_heads, self.key_dim, self.key_embedding)

        self.W_k = self.add_weight(shape=per_head_shape, name='key')
        self.W_q = self.add_weight(shape=per_head_shape, name='query')
        self.W_v = self.add_weight(shape=per_head_shape, name='value')

        self.W_o = self.add_weight(shape=(self.key_dim, self.key_embedding))

    def call(self, inputs):
        """Run every head on the ``(query, key, value)`` triple and combine them."""
        query, key, value = inputs

        self.head_vectors = []
        for head in range(self.num_heads):
            # 'bij, ij -> bij' is an element-wise product broadcast over the batch.
            q = tf.einsum('bij, ij -> bij', query, self.W_q[head])
            k = tf.einsum('bij, ij -> bij', key, self.W_k[head])
            v = tf.einsum('bij, ij -> bij', value, self.W_v[head])

            self.head_vectors.append(scaled_dot_product_attention(q, k, v))

        # Stack the heads along the second-to-last axis, then sum that axis out
        # while scaling by W_o.
        stacked = tf.concat(self.head_vectors, -2)
        return tf.einsum('bij, kj -> bkj', stacked, self.W_o)
141
+
142
class Decoder(tf.keras.layers.Layer):
    """One decoder block: attention, then a two-step element-wise feed-forward.

    Both sub-layers are wrapped in a residual connection followed by layer
    normalisation over axis -2.

    Args:
        num_heads: Number of attention heads.
        key_dim: Size of the second-to-last input axis.
        key_embedding: Size of the last input axis.
        GPT_attention: Use ``GPT_Attention`` when true, otherwise
            ``MultiHeadAttention``.
    """

    def __init__(self, num_heads = 8, key_dim = 64, key_embedding = 512, GPT_attention = False):
        super(Decoder, self).__init__()

        self.num_heads = num_heads
        self.key_dim = key_dim
        self.key_embedding = key_embedding

        if GPT_attention:
            self.attention = GPT_Attention(key_embedding, key_embedding, num_heads)
        else:
            self.attention = MultiHeadAttention(num_heads = num_heads, key_dim = key_dim, key_embedding = key_embedding)

        self.normalize1 = tf.keras.layers.LayerNormalization(axis = -2)
        self.normalize2 = tf.keras.layers.LayerNormalization(axis = -2)

    def build(self, input_shape):
        """Create the two element-wise weight grids and their bias grids."""
        grid_shape = (self.key_dim, self.key_embedding)

        self.x1 = self.add_weight(shape=grid_shape, name='vec1')
        self.x2 = self.add_weight(shape=grid_shape, name='vec2')

        self.y1 = self.add_weight(shape=grid_shape, name='bias1')
        self.y2 = self.add_weight(shape=grid_shape, name='bias2')

    def call(self, inputs):
        """Apply attention and the feed-forward step, each with residual + norm."""
        # Self-attention: the same tensor serves as query, key and value.
        attended = self.attention((inputs, inputs, inputs))
        attended = self.normalize1(attended + inputs)

        # Feed-forward: element-wise scale + bias, ReLU, element-wise scale + bias.
        hidden = tf.einsum('bij, ij -> bij', attended, self.x1) + self.y1
        hidden = tf.keras.activations.relu(hidden)
        projected = tf.einsum('bij, ij -> bij', hidden, self.x2) + self.y2

        return self.normalize2(projected + attended)
180
+
181
def positional_function(words, embedding, d_model=512):
    """Build a sinusoidal position table of shape (words, embedding).

    Even columns hold ``sin(i / 10000 ** (2 * j / d_model))`` and odd columns
    hold the cosine of the same argument, where ``i`` is the row (position)
    and ``j`` is the column index.

    Args:
        words: Number of positions (rows).
        embedding: Number of columns.
        d_model: Denominator of the frequency exponent. It defaults to 512,
            the value that used to be hard-coded, so existing callers and
            previously trained weights see exactly the same table. Pass the
            real embedding width to scale the frequencies to the model size.

    Returns:
        A float64 ``numpy.ndarray`` of shape (words, embedding).
    """
    # NOTE(review): the exponent uses the raw column index j for both the sine
    # and cosine columns, not a shared pair index j // 2 -- confirm this is
    # intended before changing it, since trained weights depend on the table.
    pos = np.zeros((words, embedding))

    # The divisor depends only on the column, so compute it once per column
    # instead of once per cell.
    divisors = [pow(10000, 2 * j / d_model) for j in range(embedding)]

    for i in range(words):
        for j, divisor in enumerate(divisors):
            wave = math.sin if j % 2 == 0 else math.cos
            pos[i, j] = wave(i / divisor)

    return pos
192
+
193
+
194
class PositionalEmbedding(tf.keras.layers.Layer):
    """Scale token embeddings element-wise by a fixed position table.

    Args:
        positional_function: Callable ``(words, embedding_size)`` returning the
            position table.
        embedding_size: Size of the last input axis.
        words: Size of the second-to-last input axis (number of positions).
    """

    def __init__(self, positional_function = positional_function, embedding_size = 512, words = 64):
        super(PositionalEmbedding, self).__init__()
        self.embedding_size = embedding_size
        self.words = words

        # The table is computed once here and stored as a float32 tensor.
        table = positional_function(self.words, self.embedding_size)
        self.pos_mat = tf.cast(tf.convert_to_tensor(table), tf.float32)

    def build(self, input_sizes):
        """Print the input shape the layer is built with; creates no weights."""
        print(input_sizes)

    def call(self, inputs):
        """Multiply ``inputs`` element-wise by the position table."""
        # 'bij, ij -> bij' is an element-wise product broadcast over the batch.
        return tf.einsum("bij, ij -> bij", inputs, self.pos_mat)
207
+
208
def generate_output(model, vectorizer, text_size = 70, gpt_input = 64, input_sequence = None):
    """Greedily generate ``text_size`` token ids with ``model`` and decode them.

    Args:
        model: Callable taking an array of shape (1, gpt_input) and returning
            per-token scores; the arg-max over the last axis is the next token.
        vectorizer: Object whose ``get_vocabulary()`` maps token ids to strings.
        text_size: Total number of token slots in the generated text.
        gpt_input: Number of tokens the model consumes per step.
        input_sequence: Optional prompt indexable as ``input_sequence[0][:n]``.
            ``None`` or an empty sequence means an all-zero prompt.

    Returns:
        The decoded tokens joined by single spaces.
    """
    # ``None``/empty check instead of ``== []``: comparing a NumPy array with a
    # list is element-wise and is not a reliable emptiness test.
    if input_sequence is None or len(input_sequence) == 0:
        input_sequence = np.zeros((1, gpt_input), dtype=np.float32)

    text = np.zeros((1, text_size), dtype=np.float32)

    # Copy at most ``text_size`` prompt tokens so a short output buffer
    # (e.g. text_size == 0) no longer raises a shape-mismatch error.
    prompt_len = min(gpt_input, text_size)
    text[0][:prompt_len] = input_sequence[0][:prompt_len]

    for i in tqdm(range(gpt_input, text_size)):
        prediction = tf.argmax(model(input_sequence), -1).numpy()
        # NOTE(review): the prediction is stored at index i - 1, so the first
        # step overwrites the last prompt token and the final slot stays 0 --
        # confirm whether index i was intended. Behaviour is kept unchanged.
        text[0][i - 1] = np.asarray(prediction).reshape(-1)[0]
        input_sequence = text[0][i - gpt_input : i].reshape(1, gpt_input)

    # Fetch the vocabulary once rather than once per decoded token.
    vocabulary = vectorizer.get_vocabulary()
    return ' '.join(vocabulary[int(token_id)] for token_id in text[0])
MinimalGPT_2.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tensorflow as tf
4
+ from tqdm import tqdm
5
+ from GPT import *
6
+ import pickle
7
+ import argparse
8
+ import sys
9
+
10
+
11
+
12
def save_module(save_weights, model, vectorizer, save_tokenizer):
    """Persist the model weights (pickle) and the vocabulary (JSON).

    Args:
        save_weights: Path of the pickle file receiving ``model.weights``.
        model: Object exposing a ``weights`` attribute.
        vectorizer: Object exposing ``get_vocabulary()``.
        save_tokenizer: Path of the JSON file receiving the vocabulary.
    """
    with open(save_weights, 'wb') as weights_file:
        pickle.dump(model.weights, weights_file)

    # The first two vocabulary entries are skipped; the rest are escaped so
    # that every entry is plain ASCII inside the JSON file.
    tokens = vectorizer.get_vocabulary()[2:]
    escaped = [token.encode('unicode_escape').decode('utf-8') for token in tokens]

    with open(save_tokenizer, 'w') as vocab_file:
        json.dump(escaped, vocab_file)
        print("Vocabulary size saved: " + str(len(escaped)))
29
+
30
+
31
+
32
+
33
+
34
def read_file(f, vectorizer, chunk_size = 1024, starting_chunk = 0, ending_chunk = 5, gpt_input = 10):
    """Yield vectorised training windows, one tensor per chunk of ``f``.

    The file is read ``chunk_size`` characters at a time. Chunks numbered
    ``starting_chunk`` through ``ending_chunk`` (inclusive, counted from 0) are
    split on whitespace and turned into sliding windows of ``gpt_input`` tokens
    plus the token that follows each window.

    Args:
        f: Open text file object.
        vectorizer: Callable mapping a list of strings to token ids.
        chunk_size: Number of characters read per chunk.
        starting_chunk: Index of the first chunk to yield.
        ending_chunk: Index of the last chunk to yield.
        gpt_input: Number of tokens in each input window.

    Yields:
        ``tf.concat([X, Y], 1)``: vectorised windows with the vectorised
        target token appended along axis 1.
    """
    index = 0

    while True:
        data = f.read(chunk_size)

        # Stop at end of file or once the last requested chunk has been passed.
        if not data or index > ending_chunk:
            break

        # index <= ending_chunk is guaranteed here by the break above.
        if index >= starting_chunk:
            tokens = data.split()
            window_count = len(tokens) - gpt_input - 1

            windows = [' '.join(tokens[j : j + gpt_input]) for j in range(window_count)]
            targets = [tokens[j + gpt_input] for j in range(window_count)]

            yield tf.concat([vectorizer(windows), vectorizer(targets)], 1)

        index += 1
63
+
64
+
65
def get_model(gpt_input, d_model, h, vocab_size, decoder_stacks, GPT_attention):
    """Assemble the Keras model: embedding, position scaling, decoders, softmax.

    Args:
        gpt_input: Number of token ids fed to the model per example.
        d_model: Embedding width.
        h: Number of attention heads in every decoder.
        vocab_size: Vocabulary size excluding the two extra ids reserved by the
            embedding layer (which is created with ``vocab_size + 2`` rows).
        decoder_stacks: Total number of stacked ``Decoder`` layers (at least
            one is always created).
        GPT_attention: Forwarded to ``Decoder`` to select the attention type.

    Returns:
        A ``tf.keras.Model`` mapping (batch, gpt_input) token ids to a
        probability distribution over ``vocab_size + 3`` classes.
    """
    # ``(gpt_input)`` is just an int; the shape must be a real 1-tuple.
    input_words = tf.keras.layers.Input((gpt_input,))
    embedding = tf.keras.layers.Embedding(vocab_size + 2, d_model)(input_words)
    positional_enc = PositionalEmbedding(words = gpt_input, embedding_size = d_model)(embedding)
    decoder = Decoder(num_heads = h, key_dim = gpt_input, key_embedding = d_model, GPT_attention = GPT_attention)(positional_enc)

    for _ in range(decoder_stacks - 1):
        decoder = Decoder(num_heads = h, key_dim = gpt_input, key_embedding = d_model, GPT_attention = GPT_attention)(decoder)

    decoder = tf.keras.layers.Flatten()(decoder)
    # NOTE(review): the output layer has vocab_size + 3 units while the
    # embedding has vocab_size + 2 rows -- confirm the extra class is intended.
    # Left unchanged so previously saved weights still load.
    linear_layer = tf.keras.layers.Dense(vocab_size + 3)(decoder)
    softmax = tf.nn.softmax(linear_layer)

    return tf.keras.Model(inputs = input_words, outputs = softmax)
80
+
81
+
82
def MinimalGPT(data_path='.',
               learning_rate=0,
               output_length=0,
               epochs = 1,
               batch_size = 1,
               gpt_input=10,
               d_model=128,
               h=8,
               decoder_stacks=1,
               starting_chunk = 0,
               ending_chunk = 5,
               chunk_size = 10,
               token_end=40000,
               vocabulary_start = 0,
               vocabulary_end = 40000,
               save=False,
               load_tokenizer=None,
               load_weights=None,
               save_tokenizer=None,
               save_weights=None,
               optimizer=None,
               inference_only = False,
               return_model_and_vectorizer = False,
               return_model_and_vectorizer_and_output = False,
               GPT_attention = False,
               TPU = False):
    """Build (or load) a tokenizer and model, optionally train, then generate text.

    Args:
        data_path: Path of the UTF-8 training corpus.
        learning_rate: Adam learning rate; training runs only when it is > 0.
        output_length: Number of tokens to generate after training.
        epochs: Number of passes over the selected chunks.
        batch_size: Batch size passed to ``model.fit``.
        gpt_input: Number of tokens per model input.
        d_model: Embedding width.
        h: Number of attention heads.
        decoder_stacks: Number of stacked decoder layers.
        starting_chunk: Index of the first corpus chunk used for training.
        ending_chunk: Index of the last corpus chunk used for training.
        chunk_size: Chunk size; multiplied by 1024 before it is passed to
            ``read_file``.
        token_end: Unused; kept for backward compatibility.
        vocabulary_start: First corpus token used to build a new vocabulary.
        vocabulary_end: End (exclusive) of the vocabulary token slice.
        save: Save weights and tokenizer after generation.
        load_tokenizer: Path of a JSON vocabulary written by ``save_module``.
        load_weights: Path of a pickle file with model weights.
        save_tokenizer: Destination path for the vocabulary.
        save_weights: Destination path for the weights.
        optimizer: Optimizer passed to ``model.compile``; Adam when falsy.
        inference_only: Skip compilation and training.
        return_model_and_vectorizer: Return ``(model, vectorizer)``.
        return_model_and_vectorizer_and_output: Return
            ``(model, vectorizer, text)``.
        GPT_attention: Select the GPT-style attention layer.
        TPU: When set together with ``save``, nothing is written here; the
            save arguments are returned so the caller can save afterwards.

    Returns:
        * ``(save_weights, model, vectorizer, save_tokenizer, output_seq)``
          when both ``save`` and ``TPU`` are set;
        * ``(model, vectorizer)`` or ``(model, vectorizer, text)`` for the
          respective ``return_*`` flags;
        * otherwise the generated text with ``'@@ '`` markers removed.

    Raises:
        ValueError: If ``inference_only`` is set without ``load_tokenizer``
            (there is no corpus to build a vocabulary from).
    """
    if chunk_size:
        chunk_size *= 1024

    if load_tokenizer:
        with open(load_tokenizer, 'r') as f:
            encoded_vocabulary = json.load(f)

        # Undo the unicode_escape encoding applied when the vocabulary was saved.
        vocabulary = [word.encode('utf-8').decode('unicode_escape') for word in encoded_vocabulary]
        vectorizer = tf.keras.layers.TextVectorization(standardize = None, split = 'whitespace')
        vectorizer.set_vocabulary(vocabulary)
        vocab_size = vectorizer.vocabulary_size()
    else:
        if inference_only:
            # Previously this path failed with a NameError on ``corpus``.
            raise ValueError('inference_only requires load_tokenizer: no corpus is read to build a vocabulary')

        # The corpus is only needed to build a new vocabulary, so it is read
        # here and released as soon as the vocabulary exists.
        with open(data_path, 'r', encoding = 'utf-8') as file:
            corpus = file.read()

        # sorted() makes the token order reproducible; a bare set() order
        # changes between runs because str hashing is randomised.
        vocab = sorted(set(corpus.split()[vocabulary_start : vocabulary_end]))
        del corpus

        vocab_size = len(vocab)
        vectorizer = tf.keras.layers.TextVectorization(standardize = None, split = 'whitespace', vocabulary = vocab)
        print('New Vectorizer created successfully...')
        print("Vocabulary Size: " + str(vocab_size))

    if load_weights:
        # NOTE(review): the "- 2" presumably compensates for the two extra
        # entries counted by vocabulary_size() of a loaded tokenizer -- confirm
        # it is also correct when load_weights is used without load_tokenizer.
        model = get_model(gpt_input = gpt_input, d_model = d_model, h = h, decoder_stacks = decoder_stacks, vocab_size = vocab_size - 2, GPT_attention = GPT_attention)

        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load weight files from a trusted source.
        with open(load_weights, 'rb') as file:
            W = pickle.load(file)
        model.set_weights(W)
    else:
        model = get_model(gpt_input = gpt_input, d_model = d_model, h = h, decoder_stacks = decoder_stacks, vocab_size = vocab_size, GPT_attention = GPT_attention)

    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    if not inference_only:
        chosen_optimizer = optimizer if optimizer else tf.keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(optimizer=chosen_optimizer, loss='sparse_categorical_crossentropy')

        # Train only when a positive learning rate was requested.
        if learning_rate > 0:
            total_chunks = ending_chunk - starting_chunk + 1

            for epoch in tqdm(range(epochs)):
                with open(data_path, 'r', encoding='utf-8') as f:
                    chunks = read_file(f, vectorizer, chunk_size, starting_chunk, ending_chunk, gpt_input)
                    for chunk_number, chunk in enumerate(chunks, start=1):
                        print('Chunk_size: ' + str(chunk.shape[0]))
                        # First gpt_input columns are the inputs, the last column is the target.
                        model.fit(chunk[:, :gpt_input], tf.reshape(chunk[:, -1], (-1, 1)), batch_size = batch_size, epochs=1)
                        print("Chunk Number " + str(chunk_number) + "/" + str(total_chunks) + " processed!")

    output_seq = generate_output(gpt_input = gpt_input, model = model, vectorizer = vectorizer, text_size = output_length, input_sequence = [])

    if save and not TPU:
        print('Saveeeeee')
        save_module(save_weights, model, vectorizer, save_tokenizer)

    if save and TPU:
        # Under a TPU strategy the caller performs the save after the scope exits.
        return save_weights, model, vectorizer, save_tokenizer, output_seq

    if return_model_and_vectorizer:
        return model, vectorizer
    elif return_model_and_vectorizer_and_output:
        return model, vectorizer, output_seq.replace('@@ ', '')
    else:
        return output_seq.replace('@@ ', '')
230
+
231
+
232
+
233
+ # Example code to execute when the script file is called
234
+
235
def main():
    """Print a fixed notice; it performs no other work."""
    notice = "This code is executed when the script file is called directly."
    print(notice)
237
+
238
+ # Check if the script is being run as the main module
239
if __name__ == '__main__':
    # Command-line entry point: parse options, record them, then run MinimalGPT.
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data-path', help='File: Corresponding to corpus or training text [String]')
    parser.add_argument('-l', '--learning-rate', help='Float: Learning Rate. The model will train ONLY IF the rate is > 0, skip otherwise [Float]', type=float)
    parser.add_argument('-ol', '--output-length', help='Length of the output sequence to be generated', type=int)
    parser.add_argument('-e', '--epochs', help='Number of training Epochs [Int]', type=int)
    parser.add_argument('-b', '--batch-size', help='Size of each batch [Int]', type=int)
    parser.add_argument('-s', '--gpt-input', help='Number of Tokens of text the model inputs at a time [Int]', type=int)
    parser.add_argument('-dm', '--d-model', help='Embedding layer output dimensions [Int]', type=int)
    parser.add_argument('-p', '--multi-head', help='Number of Multi-head Attention layer in parallel [Int]', type=int)
    parser.add_argument('-ds', '--decoder-stacks', help='Number of stacked Decoder layer [Int]', type=int)
    parser.add_argument('-sc', '--chunk-start', help='The chunk number in the corpus to mark it as the starting point of the training [Int]', type=int)
    parser.add_argument('-ec', '--chunk-end', help='The chunk number in the corpus to mark it as the end point of the training [Int]', type=int)
    parser.add_argument('-csz', '--chunk-size', help='The size of each chunk in KB.', type=int)
    parser.add_argument('-vs', '--vocabulary-start', help='Token number from the corpus to mark the starting point of vocabulary data [Int]', type=int)
    parser.add_argument('-ve', '--vocabulary-end', help='Token number from the corpus to mark the end point of vocabulary data [Int]', type=int)
    parser.add_argument('-sd', '--save', help='Save the Model and Vectorizer data to disk [True/False]', action='store_true')
    parser.add_argument('-lt', '--load-tokenizer', help='File: Vectorization layer [File]')
    parser.add_argument('-lw', '--load-weights', help='File: Model Weights [File]')
    parser.add_argument('-st', '--save-tokenizer', help='File: Saving Vectorizer File [File]')
    parser.add_argument('-sw', '--save-weights', help='File: Saving Model Weights[File]')
    parser.add_argument('-ot', '--optimizer', help='Optimizer consistent to TensorFlow optimizer class [tf.keras.optimizers]')
    parser.add_argument('-i', '--inference-only', help='Only Print the output of the model in Inference Mode [True/False]', action='store_true')
    parser.add_argument('-mv', '--model-vectorizer', help='Return Model, Vectorizer Tuple [True/False]', action='store_true')
    parser.add_argument('-mvo', '--model-vectorizer-output', help='Return Model, Vectorizer, Output Tuple [True/False]', action='store_true')
    parser.add_argument('-ga', '--gpt-style-attention', help='Uses GPT-styled attention. Note: (d-model) parameter should be divisible by (multi-head), otherwise the program will throw an error! [True/False]', action='store_true')
    parser.add_argument('-tpu', '--TPU', help='Use Tensor Processor Units (Distributed Learning)', action='store_true')

    args = parser.parse_args()

    # Record the raw command-line values of this run.
    configuration = {
        'data_path': args.data_path,
        'learning_rate': args.learning_rate,
        'output_length': args.output_length,
        'epochs': args.epochs,
        'batch_size': args.batch_size,
        'gpt_input': args.gpt_input,
        'd_model': args.d_model,
        'h': args.multi_head,
        'stacks': args.decoder_stacks,
        'chunk_start': args.chunk_start,
        'chunk_end': args.chunk_end,
        'chunk_size': args.chunk_size,
        'vocabulary_start': args.vocabulary_start,
        'vocabulary_end': args.vocabulary_end,
        'save': args.save,
        'load_tokenizer': args.load_tokenizer,
        'load_weights': args.load_weights,
        'save_tokenizer': args.save_tokenizer,
        'save_weights': args.save_weights,
        'optimizer': args.optimizer,
        'inference_only': args.inference_only,
        'model_and_vectorizer': args.model_vectorizer,
        'model_vectorizer_output': args.model_vectorizer_output,
        'GPT_Attention' : args.gpt_style_attention
    }

    with open('last-configuration.json', 'w') as file:
        json.dump(configuration, file)

    # Command-line values keyed by the MinimalGPT parameter they feed.
    cli_values = {
        'data_path': args.data_path,
        'learning_rate': args.learning_rate,
        'output_length': args.output_length,
        'epochs': args.epochs,
        'batch_size': args.batch_size,
        'gpt_input': args.gpt_input,
        'd_model': args.d_model,
        'h': args.multi_head,
        'decoder_stacks': args.decoder_stacks,
        'starting_chunk': args.chunk_start,
        'ending_chunk': args.chunk_end,
        'chunk_size': args.chunk_size,
        'vocabulary_start': args.vocabulary_start,
        'vocabulary_end': args.vocabulary_end,
        'save': args.save,
        'load_tokenizer': args.load_tokenizer,
        'load_weights': args.load_weights,
        'save_tokenizer': args.save_tokenizer,
        'save_weights': args.save_weights,
        'optimizer': args.optimizer,
        'inference_only': args.inference_only,
        'return_model_and_vectorizer': args.model_vectorizer,
        'return_model_and_vectorizer_and_output': args.model_vectorizer_output,
        'GPT_attention': args.gpt_style_attention,
    }

    # argparse stores None for every option that was not given. Passing those
    # through would override MinimalGPT's own defaults (e.g. epochs=None), so
    # only options that were actually supplied are forwarded.
    call_kwargs = {name: value for name, value in cli_values.items() if value is not None}

    if args.TPU:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
        tf.config.experimental_connect_to_cluster(resolver)
        # This is the TPU initialization code that has to be at the beginning.
        tf.tpu.experimental.initialize_tpu_system(resolver)
        print("All devices: ", tf.config.list_logical_devices('TPU'))

        strategy = tf.distribute.TPUStrategy(resolver)

        with strategy.scope():
            output = MinimalGPT(TPU = True, **call_kwargs)

        if args.save:
            # With save and TPU both set, MinimalGPT returns
            # (save_weights, model, vectorizer, save_tokenizer, output_seq).
            save_module(output[0], output[1], output[2], output[3])
            print(output[4])
        else:
            # Without --save the return value is not that 5-tuple, so it is
            # printed as-is instead of being indexed.
            print(output)
        sys.exit(0)

    output = MinimalGPT(TPU = False, **call_kwargs)
    print(output)