abhaskumarsinha committed
Commit d891407
Parent(s): f36f46c
Upload 2 files
Browse files:
- GPT.py +226 -0
- MinimalGPT_2.py +401 -0

GPT.py
ADDED
@@ -0,0 +1,226 @@
import numpy as np
import pandas as pd
import tensorflow as tf
import math
from tqdm import tqdm


def scaled_dot_product_attention(q, k, v):
    # calculate the dot product of query and key
    dot_product = tf.matmul(q, k, transpose_b=True)

    # scale the dot product by the square root of the key dimension
    scaled_dot_product = dot_product / tf.math.sqrt(tf.cast(tf.shape(k)[-1], dtype=tf.float32))

    # apply softmax activation to obtain attention weights
    attention_weights = tf.nn.softmax(scaled_dot_product, axis=-1)

    # compute the weighted sum of the value vectors with the attention weights
    output = tf.matmul(attention_weights, v)

    return output

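
# A minimal shape sanity check for scaled_dot_product_attention (illustrative
# only, not part of the original file): with q, k, v of shape
# (batch, heads, seq, depth), the output keeps the same shape.
def _demo_attention_shapes():
    q = tf.random.normal((2, 4, 8, 16))
    k = tf.random.normal((2, 4, 8, 16))
    v = tf.random.normal((2, 4, 8, 16))
    out = scaled_dot_product_attention(q, k, v)
    print(out.shape)  # (2, 4, 8, 16)
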

class LinearLayer(tf.keras.layers.Layer):
    def __init__(self, ix, ox):
        super().__init__()
        self.ix = ix
        self.ox = ox

    def build(self, input_shapes):
        self.w1 = self.add_weight(shape=(self.ix, self.ox))
        self.b1 = self.add_weight(shape=(1, self.ox))

    def call(self, inputs):
        bz, key = tf.shape(inputs)[0], tf.shape(inputs)[1]
        inputs = tf.reshape(inputs, (-1, self.ix))
        inputs = tf.matmul(inputs, self.w1) + self.b1
        inputs = tf.reshape(inputs, (bz, key, self.ox))
        return inputs


class split_heads(tf.keras.layers.Layer):
    def __init__(self, num_heads=10):
        super().__init__()
        self.num_heads = num_heads

    def call(self, inputs):
        bz, key = tf.shape(inputs)[0], tf.shape(inputs)[1]

        inputs = tf.reshape(inputs, (bz, key, self.num_heads, -1))
        inputs = tf.transpose(inputs, (0, 2, 1, 3))

        return inputs


class merge_heads(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()

    def call(self, inputs):
        bz, key = tf.shape(inputs)[0], tf.shape(inputs)[2]

        inputs = tf.transpose(inputs, (0, 2, 1, 3))
        inputs = tf.reshape(inputs, (bz, key, -1))
        return inputs

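
# Illustrative round-trip check (not in the original file): merge_heads is the
# inverse of split_heads, so merging a split tensor restores the original
# (batch, seq, embedding) shape. Assumes embedding is divisible by num_heads.
def _demo_split_merge_roundtrip():
    x = tf.random.normal((2, 8, 32))      # (batch, seq, embedding)
    heads = split_heads(num_heads=4)(x)   # (2, 4, 8, 8)
    merged = merge_heads()(heads)         # back to (2, 8, 32)
    print(heads.shape, merged.shape)
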

class GPT_Attention(tf.keras.layers.Layer):

    def __init__(self, ix, ox, num_heads):
        super().__init__()
        self.ix = ix
        self.ox = ox
        self.num_heads = num_heads
        self.linear1 = LinearLayer(self.ix, self.ox * 3)
        self.split = split_heads(num_heads=self.num_heads)
        self.merge = merge_heads()
        self.linear2 = LinearLayer(self.ox, self.ix)

        if self.ox % self.num_heads != 0:
            raise ValueError('The value ox = ' + str(self.ox) + ' SHOULD be divisible by the number of heads provided')

    def call(self, inputs):
        # unwrap the (query, key, value) tuple; all three are the same tensor here
        if len(inputs) > 0:
            inputs = inputs[0]
        inputs = self.linear1(inputs)
        q, k, v = tf.split(inputs, 3, axis=-1)
        q = self.split(q)
        k = self.split(k)
        v = self.split(v)
        # pass the projections in (q, k, v) order to match the function signature
        inputs = scaled_dot_product_attention(q, k, v)
        inputs = self.merge(inputs)
        inputs = self.linear2(inputs)

        return inputs

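
# A hedged usage sketch (not in the original file): GPT_Attention expects a
# (query, key, value) tuple and returns a tensor with the input's embedding
# size. Note that no causal mask is applied inside this layer.
def _demo_gpt_attention():
    x = tf.random.normal((2, 8, 64))                 # (batch, seq, embedding)
    attn = GPT_Attention(ix=64, ox=64, num_heads=4)  # ox must divide by num_heads
    out = attn((x, x, x))
    print(out.shape)  # (2, 8, 64)
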

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, num_heads=8, key_dim=64, key_embedding=512):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.key_embedding = key_embedding
        self.head_vectors = []

    def build(self, input_shape):
        self.W_k = self.add_weight(shape=(self.num_heads, self.key_dim, self.key_embedding), name='key')
        self.W_q = self.add_weight(shape=(self.num_heads, self.key_dim, self.key_embedding), name='query')
        self.W_v = self.add_weight(shape=(self.num_heads, self.key_dim, self.key_embedding), name='value')

        self.W_o = self.add_weight(shape=(self.key_dim, self.key_embedding))

    def call(self, inputs):
        query, key, value = inputs

        self.head_vectors = []

        # weight each head's query/key/value elementwise, then attend
        for i in range(self.num_heads):
            q = tf.einsum('bij, ij -> bij', query, self.W_q[i])
            k = tf.einsum('bij, ij -> bij', key, self.W_k[i])
            v = tf.einsum('bij, ij -> bij', value, self.W_v[i])

            self.head_vectors += [scaled_dot_product_attention(q, k, v)]

        # concatenate the heads along the sequence axis, then project back
        head_concat = tf.concat(self.head_vectors, -2)
        output = tf.einsum('bij, kj -> bkj', head_concat, self.W_o)

        return output

class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_heads=8, key_dim=64, key_embedding=512, GPT_attention=False):
        super(Decoder, self).__init__()

        self.num_heads = num_heads
        self.key_dim = key_dim
        self.key_embedding = key_embedding
        if GPT_attention:
            self.attention = GPT_Attention(key_embedding, key_embedding, num_heads)
        else:
            self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim, key_embedding=key_embedding)
        self.normalize1 = tf.keras.layers.LayerNormalization(axis=-2)
        self.normalize2 = tf.keras.layers.LayerNormalization(axis=-2)

    def build(self, input_shape):
        # elementwise weights and biases of the two-layer position-wise network
        self.x1 = self.add_weight(shape=(self.key_dim, self.key_embedding), name='vec1')
        self.x2 = self.add_weight(shape=(self.key_dim, self.key_embedding), name='vec2')

        self.y1 = self.add_weight(shape=(self.key_dim, self.key_embedding), name='bias1')
        self.y2 = self.add_weight(shape=(self.key_dim, self.key_embedding), name='bias2')

    def call(self, inputs):
        # self-attention sublayer with residual connection and normalization
        first_sublayer_output = self.attention((inputs, inputs, inputs))
        first_sublayer_output = self.normalize1(first_sublayer_output + inputs)

        # feed-forward sublayer: elementwise affine -> ReLU -> elementwise affine
        first_nn = tf.einsum('bij, ij -> bij', first_sublayer_output, self.x1) + self.y1
        first_nn = tf.keras.activations.relu(first_nn, alpha=0.0, max_value=None, threshold=0.0)
        second_nn = tf.einsum('bij, ij -> bij', first_nn, self.x2) + self.y2

        second_sublayer_output = self.normalize2(second_nn + first_sublayer_output)

        return second_sublayer_output

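
# Illustrative sketch (not in the original file): a single Decoder block maps a
# (batch, seq, embedding) tensor to the same shape, so blocks can be stacked.
# key_dim must equal the sequence length, key_embedding the embedding size.
def _demo_decoder_block():
    x = tf.random.normal((2, 8, 64))  # (batch, seq=key_dim, embedding)
    block = Decoder(num_heads=4, key_dim=8, key_embedding=64, GPT_attention=True)
    print(block(x).shape)  # (2, 8, 64)
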
def positional_function(words, embedding):
    pos = np.zeros((words, embedding))

    # sinusoidal positional encoding; frequency scaled by the embedding width
    for i in range(words):
        for j in range(embedding):
            if j % 2 == 0:
                pos[i, j] = math.sin(i / pow(10000, 2 * j / embedding))
            else:
                pos[i, j] = math.cos(i / pow(10000, 2 * j / embedding))

    return pos


class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, positional_function=positional_function, embedding_size=512, words=64):
        super(PositionalEmbedding, self).__init__()
        self.embedding_size = embedding_size
        self.words = words
        self.pos_mat = tf.cast(tf.convert_to_tensor(positional_function(self.words, self.embedding_size)), tf.float32)

    def build(self, input_sizes):
        pass

    def call(self, inputs):
        # note: the positional matrix is multiplied elementwise into the
        # embeddings here, rather than added as in the original Transformer
        embed = tf.einsum("bij, ij -> bij", inputs, self.pos_mat)
        return embed

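
# Quick illustrative check (not in the original file): the positional matrix
# has one row per position, and the layer preserves the input shape.
def _demo_positional_embedding():
    pe = PositionalEmbedding(embedding_size=64, words=8)
    x = tf.ones((1, 8, 64))
    print(pe(x).shape)       # (1, 8, 64)
    print(pe.pos_mat.shape)  # (8, 64)
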
def generate_output(model, vectorizer, text_size=70, gpt_input=64, input_sequence=[]):

    # seed the window with zeros when no prompt sequence is provided
    if input_sequence == []:
        input_sequence = tf.zeros((1, gpt_input)).numpy()

    text = tf.zeros((1, text_size)).numpy()
    text[0][:gpt_input] = input_sequence[0][:gpt_input]

    GPT = model

    # sliding-window generation: predict one token, then shift the
    # gpt_input-sized window one step to the right and repeat
    for i in tqdm(range(gpt_input, text_size)):
        output = tf.argmax(GPT(input_sequence), -1).numpy()
        text[0][i - 1] = output
        input_sequence = text[0][i - gpt_input : i].reshape(1, gpt_input)

    # map token ids back to vocabulary strings
    op = [vectorizer.get_vocabulary()[int(text[0][i])] for i in range(len(text[0]))]
    return ' '.join(op)
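
# End-to-end usage sketch (not in the original file): builds a throwaway
# vectorizer and a random, untrained stand-in "model" with the right
# input/output shapes, just to show how generate_output is driven. A real
# model comes from MinimalGPT_2.py.
def _demo_generate_output():
    vec = tf.keras.layers.TextVectorization(standardize=None, split='whitespace',
                                            vocabulary=['hello', 'world', 'foo', 'bar'])
    vocab_size = vec.vocabulary_size()  # includes '' and '[UNK]'
    gpt_input = 4
    inp = tf.keras.layers.Input((gpt_input,))
    logits = tf.keras.layers.Dense(vocab_size)(tf.keras.layers.Flatten()(inp))
    model = tf.keras.Model(inp, tf.nn.softmax(logits))
    print(generate_output(model, vec, text_size=10, gpt_input=gpt_input))
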
MinimalGPT_2.py
ADDED
@@ -0,0 +1,401 @@
import os
import json
import tensorflow as tf
from tqdm import tqdm
from GPT import *
import pickle
import argparse
import sys


def save_module(save_weights, model, vectorizer, save_tokenizer):

    # Save the GPT model weights
    with open(save_weights, 'wb') as file:
        pickle.dump(model.weights, file)

    # Save the vectorizer's vocabulary, encoded as JSON-compatible strings;
    # the first two entries ('' and '[UNK]') are re-created on load, so skip them
    vocabulary = vectorizer.get_vocabulary()
    encoded_vocabulary = [word.encode('unicode_escape').decode('utf-8') for word in vocabulary]
    encoded_vocabulary = encoded_vocabulary[2:]

    # Save the encoded vocabulary to a JSON file
    with open(save_tokenizer, 'w') as f:
        json.dump(encoded_vocabulary, f)
    print("Vocabulary size saved: " + str(len(encoded_vocabulary)))

def read_file(f, vectorizer, chunk_size=1024, starting_chunk=0, ending_chunk=5, gpt_input=10):
    i = 0

    while True:
        data = f.read(chunk_size)

        if not data or i > ending_chunk:
            break

        if starting_chunk <= i <= ending_chunk:
            # slide a gpt_input-sized window over the chunk: the window is the
            # input, the word immediately after it is the target
            file_contents = data.split()
            input_tokens, output_tokens = [], []
            for j in range(len(file_contents) - gpt_input - 1):
                input_tokens += [file_contents[j : j + gpt_input]]
                output_tokens += [file_contents[j + gpt_input]]

            X = [' '.join(input_tokens[j]) for j in range(len(input_tokens))]
            Y = output_tokens

            X = vectorizer(X)
            Y = vectorizer(Y)

            # inputs and targets side by side: columns [:gpt_input] and [-1]
            output = tf.concat([X, Y], 1)

            yield output

        i += 1

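
# Illustrative sketch (not in the original file): read_file slides a
# gpt_input-sized window over each chunk of a file-like object and yields a
# tensor whose first gpt_input columns are the context and whose last column
# is the next-token target. (A chunk boundary may still cut a word in half;
# that is inherent to the byte-based chunking above.)
def _demo_read_file():
    import io
    words = ' '.join(str(n) for n in range(40))
    vec = tf.keras.layers.TextVectorization(standardize=None, split='whitespace',
                                            vocabulary=[str(n) for n in range(40)])
    for batch in read_file(io.StringIO(words), vec, chunk_size=64,
                           starting_chunk=0, ending_chunk=1, gpt_input=4):
        print(batch.shape)  # (windows_in_chunk, 4 + 1)
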
def get_model(gpt_input, d_model, h, vocab_size, decoder_stacks, GPT_attention):
    input_words = tf.keras.layers.Input((gpt_input))
    embedding = tf.keras.layers.Embedding(vocab_size + 2, d_model)(input_words)
    positional_enc = PositionalEmbedding(words=gpt_input, embedding_size=d_model)(embedding)
    decoder = Decoder(num_heads=h, key_dim=gpt_input, key_embedding=d_model, GPT_attention=GPT_attention)(positional_enc)

    for _ in range(decoder_stacks - 1):
        decoder = Decoder(num_heads=h, key_dim=gpt_input, key_embedding=d_model, GPT_attention=GPT_attention)(decoder)

    decoder = tf.keras.layers.Flatten()(decoder)
    linear_layer = tf.keras.layers.Dense(vocab_size + 3)(decoder)
    softmax = tf.nn.softmax(linear_layer)
    GPT = tf.keras.Model(inputs=input_words, outputs=softmax)

    return GPT

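
# Illustrative sketch (not in the original file): build a tiny model to see
# the wiring; the output is a (batch, vocab_size + 3) next-token distribution.
def _demo_get_model():
    tiny = get_model(gpt_input=8, d_model=32, h=4, vocab_size=100,
                     decoder_stacks=2, GPT_attention=True)
    print(tiny.output_shape)  # (None, 103)
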

def MinimalGPT(data_path='.',
               learning_rate=0,
               output_length=0,
               epochs=1,
               batch_size=1,
               gpt_input=10,
               d_model=128,
               h=8,
               decoder_stacks=1,
               starting_chunk=0,
               ending_chunk=5,
               chunk_size=10,
               token_end=40000,
               vocabulary_start=0,
               vocabulary_end=40000,
               save=False,
               load_tokenizer=None,
               load_weights=None,
               save_tokenizer=None,
               save_weights=None,
               optimizer=None,
               inference_only=False,
               return_model_and_vectorizer=False,
               return_model_and_vectorizer_and_output=False,
               GPT_attention=False,
               TPU=False):

    # chunk_size is given in KB; convert it to bytes for file.read()
    if chunk_size:
        chunk_size *= 1024

    if inference_only == False:
        with open(data_path, 'r', encoding='utf-8') as file:
            corpus = file.read()

    # Either restore a saved vocabulary, or build a fresh one from the corpus
    if load_tokenizer:
        with open(load_tokenizer, 'r') as f:
            encoded_vocabulary = json.load(f)

        # Decode the encoded vocabulary back to the original strings
        vocabulary = [word.encode('utf-8').decode('unicode_escape') for word in encoded_vocabulary]
        vectorizer = tf.keras.layers.TextVectorization(standardize=None, split='whitespace')
        vectorizer.set_vocabulary(vocabulary)
        vocab_size = vectorizer.vocabulary_size()

    else:
        vocab = []
        for word in tqdm(corpus.split()[vocabulary_start : vocabulary_end]):
            vocab += [word]
        vocab = list(set(vocab))
        vocab_size = len(vocab)
        vectorizer = tf.keras.layers.TextVectorization(standardize=None, split='whitespace', vocabulary=vocab)
        print('New Vectorizer created successfully...')
        print("Vocabulary Size: " + str(vocab_size))
        del corpus

    # A saved vocabulary excludes the two special tokens ('' and '[UNK]'),
    # so a reloaded model is rebuilt with vocab_size - 2
    if load_weights:
        model = get_model(gpt_input=gpt_input, d_model=d_model, h=h, decoder_stacks=decoder_stacks, vocab_size=vocab_size - 2, GPT_attention=GPT_attention)

        with open(load_weights, 'rb') as file:
            W = pickle.load(file)
        model.set_weights(W)
    else:
        model = get_model(gpt_input=gpt_input, d_model=d_model, h=h, decoder_stacks=decoder_stacks, vocab_size=vocab_size, GPT_attention=GPT_attention)

    print(model.summary())

    if inference_only == False:
        # Compile the model
        if not optimizer:
            model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy')
        else:
            model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')

        # Train the model chunk by chunk, ONLY if the learning rate is > 0
        if learning_rate > 0:

            for epoch in tqdm(range(epochs)):

                with open(data_path, 'r', encoding='utf-8') as f:
                    chunk_number = 1
                    for chunk in read_file(f,
                                           vectorizer,
                                           chunk_size,
                                           starting_chunk,
                                           ending_chunk,
                                           gpt_input):
                        print('Chunk size: ' + str(chunk.shape[0]))
                        model.fit(chunk[:, :gpt_input], tf.reshape(chunk[:, -1], (-1, 1)), batch_size=batch_size, epochs=1)
                        print("Chunk Number " + str(chunk_number) + "/" + str(ending_chunk - starting_chunk + 1) + " processed!")
                        chunk_number += 1

    # Generate sample output from the (possibly freshly trained) model
    output_seq = generate_output(gpt_input=gpt_input, model=model, vectorizer=vectorizer, text_size=output_length, input_sequence=[])

    if save == True and TPU == False:
        print('Saving the model and the tokenizer...')
        save_module(save_weights, model, vectorizer, save_tokenizer)

    # Under a TPU strategy the caller must save outside the strategy scope,
    # so return everything save_module needs instead of saving here
    if save == True and TPU == True:
        return save_weights, model, vectorizer, save_tokenizer, output_seq

    if return_model_and_vectorizer:
        return model, vectorizer
    elif return_model_and_vectorizer_and_output:
        return model, vectorizer, output_seq.replace('@@ ', '')
    else:
        return output_seq.replace('@@ ', '')

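
# Hedged usage sketch (not in the original file): a programmatic call that
# trains on a small text file for one epoch and returns generated text. All
# paths and hyperparameters are placeholders.
#
#   text = MinimalGPT(data_path='corpus.txt', learning_rate=1e-4,
#                     output_length=50, epochs=1, batch_size=32,
#                     gpt_input=10, d_model=128, h=8, decoder_stacks=1,
#                     starting_chunk=0, ending_chunk=5, chunk_size=10,
#                     save=True, save_weights='model.pickle',
#                     save_tokenizer='tokenizer.json')
#   print(text)
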
# Example code to execute when the script file is called

def main():
    print("This code is executed when the script file is called directly.")

# Check if the script is being run as the main module
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data-path', help='File: Corresponding to corpus or training text [String]')
    parser.add_argument('-l', '--learning-rate', help='Float: Learning Rate. The model will train ONLY IF the rate is > 0, skip otherwise [Float]', type=float)
    parser.add_argument('-ol', '--output-length', help='Length of the output sequence to be generated [Int]', type=int)
    parser.add_argument('-e', '--epochs', help='Number of training Epochs [Int]', type=int)
    parser.add_argument('-b', '--batch-size', help='Size of each batch [Int]', type=int)
    parser.add_argument('-s', '--gpt-input', help='Number of Tokens of text the model inputs at a time [Int]', type=int)
    parser.add_argument('-dm', '--d-model', help='Embedding layer output dimensions [Int]', type=int)
    parser.add_argument('-p', '--multi-head', help='Number of Multi-head Attention layers in parallel [Int]', type=int)
    parser.add_argument('-ds', '--decoder-stacks', help='Number of stacked Decoder layers [Int]', type=int)
    parser.add_argument('-sc', '--chunk-start', help='The chunk number in the corpus that marks the starting point of the training [Int]', type=int)
    parser.add_argument('-ec', '--chunk-end', help='The chunk number in the corpus that marks the end point of the training [Int]', type=int)
    parser.add_argument('-csz', '--chunk-size', help='The size of each chunk in KB [Int]', type=int)
    parser.add_argument('-vs', '--vocabulary-start', help='Token number from the corpus that marks the starting point of the vocabulary data [Int]', type=int)
    parser.add_argument('-ve', '--vocabulary-end', help='Token number from the corpus that marks the end point of the vocabulary data [Int]', type=int)
    parser.add_argument('-sd', '--save', help='Save the Model and Vectorizer data to disk [True/False]', action='store_true')
    parser.add_argument('-lt', '--load-tokenizer', help='File: Vectorization layer [File]')
    parser.add_argument('-lw', '--load-weights', help='File: Model Weights [File]')
    parser.add_argument('-st', '--save-tokenizer', help='File: Saving Vectorizer File [File]')
    parser.add_argument('-sw', '--save-weights', help='File: Saving Model Weights [File]')
    parser.add_argument('-ot', '--optimizer', help='Optimizer consistent with the TensorFlow optimizer class [tf.keras.optimizers]')
    parser.add_argument('-i', '--inference-only', help='Only print the output of the model in Inference Mode [True/False]', action='store_true')
    parser.add_argument('-mv', '--model-vectorizer', help='Return (Model, Vectorizer) tuple [True/False]', action='store_true')
    parser.add_argument('-mvo', '--model-vectorizer-output', help='Return (Model, Vectorizer, Output) tuple [True/False]', action='store_true')
    parser.add_argument('-ga', '--gpt-style-attention', help='Use GPT-styled attention. Note: the (d-model) parameter should be divisible by (multi-head), otherwise the program will throw an error! [True/False]', action='store_true')
    parser.add_argument('-tpu', '--TPU', help='Use Tensor Processing Units (Distributed Learning) [True/False]', action='store_true')

    args = parser.parse_args()

    data_path = args.data_path
    learning_rate = args.learning_rate
    output_length = args.output_length
    epochs = args.epochs
    batch_size = args.batch_size
    gpt_input = args.gpt_input
    d_model = args.d_model
    h = args.multi_head
    stacks = args.decoder_stacks
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end
    chunk_size = args.chunk_size
    vocabulary_start = args.vocabulary_start
    vocabulary_end = args.vocabulary_end
    save = args.save
    load_tokenizer = args.load_tokenizer
    load_weights = args.load_weights
    save_tokenizer = args.save_tokenizer
    save_weights = args.save_weights
    optimizer = args.optimizer
    inference_only = args.inference_only
    model_and_vectorizer = args.model_vectorizer
    GPT_attention = args.gpt_style_attention
    model_vectorizer_output = args.model_vectorizer_output

    configuration = {
        'data_path': args.data_path,
        'learning_rate': args.learning_rate,
        'output_length': args.output_length,
        'epochs': args.epochs,
        'batch_size': args.batch_size,
        'gpt_input': args.gpt_input,
        'd_model': args.d_model,
        'h': args.multi_head,
        'stacks': args.decoder_stacks,
        'chunk_start': args.chunk_start,
        'chunk_end': args.chunk_end,
        'chunk_size': args.chunk_size,
        'vocabulary_start': args.vocabulary_start,
        'vocabulary_end': args.vocabulary_end,
        'save': args.save,
        'load_tokenizer': args.load_tokenizer,
        'load_weights': args.load_weights,
        'save_tokenizer': args.save_tokenizer,
        'save_weights': args.save_weights,
        'optimizer': args.optimizer,
        'inference_only': args.inference_only,
        'model_and_vectorizer': args.model_vectorizer,
        'model_vectorizer_output': args.model_vectorizer_output,
        'GPT_Attention': args.gpt_style_attention
    }

    # Save the configuration to a JSON file
    with open('last-configuration.json', 'w') as file:
        json.dump(configuration, file)

    if args.TPU == True:

        # This is the TPU initialization code that has to be at the beginning
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        print("All devices: ", tf.config.list_logical_devices('TPU'))

        strategy = tf.distribute.TPUStrategy(resolver)

        with strategy.scope():

            output = MinimalGPT(data_path=data_path,
                                learning_rate=learning_rate,
                                output_length=output_length,
                                epochs=epochs,
                                batch_size=batch_size,
                                gpt_input=gpt_input,
                                d_model=d_model,
                                h=h,
                                decoder_stacks=stacks,
                                starting_chunk=chunk_start,
                                ending_chunk=chunk_end,
                                chunk_size=chunk_size,
                                vocabulary_start=vocabulary_start,
                                vocabulary_end=vocabulary_end,
                                save=save,
                                load_tokenizer=load_tokenizer,
                                load_weights=load_weights,
                                save_tokenizer=save_tokenizer,
                                save_weights=save_weights,
                                optimizer=optimizer,
                                inference_only=inference_only,
                                return_model_and_vectorizer=model_and_vectorizer,
                                return_model_and_vectorizer_and_output=model_vectorizer_output,
                                GPT_attention=GPT_attention,
                                TPU=True)

        # Saving must happen outside the TPU strategy scope; this path assumes
        # --save was passed, so MinimalGPT returned the full save tuple
        save_module(output[0], output[1], output[2], output[3])

        print(output[4])
        sys.exit(0)

    output = MinimalGPT(data_path=data_path,
                        learning_rate=learning_rate,
                        output_length=output_length,
                        epochs=epochs,
                        batch_size=batch_size,
                        gpt_input=gpt_input,
                        d_model=d_model,
                        h=h,
                        decoder_stacks=stacks,
                        starting_chunk=chunk_start,
                        ending_chunk=chunk_end,
                        chunk_size=chunk_size,
                        vocabulary_start=vocabulary_start,
                        vocabulary_end=vocabulary_end,
                        save=save,
                        load_tokenizer=load_tokenizer,
                        load_weights=load_weights,
                        save_tokenizer=save_tokenizer,
                        save_weights=save_weights,
                        optimizer=optimizer,
                        inference_only=inference_only,
                        return_model_and_vectorizer=model_and_vectorizer,
                        return_model_and_vectorizer_and_output=model_vectorizer_output,
                        GPT_attention=GPT_attention,
                        TPU=False)
    print(output)
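
# Example invocation (illustrative; flags as defined above, paths are placeholders):
#
#   python MinimalGPT_2.py -d corpus.txt -l 0.0001 -ol 50 -e 1 -b 32 \
#       -s 10 -dm 128 -p 8 -ds 1 -sc 0 -ec 5 -csz 10 -vs 0 -ve 40000 \
#       -sd -st tokenizer.json -sw model.pickle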