Commit #1: GEM_1o_Aug trained
- GEM_1o_Aug_15.pt +3 -0
- Testings/testing.py +63 -0
- configs/__pycache__/config.cpython-310.pyc +0 -0
- configs/config.py +28 -0
- generate.py +37 -0
- models/__init__.py +0 -0
- models/__pycache__/__init__.cpython-310.pyc +0 -0
- models/__pycache__/gem_model.cpython-310.pyc +0 -0
- models/gem_model.py +55 -0
- requirements.txt +7 -0
- tokenizer-merges.txt +0 -0
- tokenizer-vocab.json +0 -0
- tokenizer/gem_tokenizer.json +0 -0
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens_map.json +5 -0
- tokenizer/tokenizer.json +0 -0
- tokenizer/tokenizer_config.json +19 -0
- tokenizer/vocab.json +0 -0
- train.py +86 -0
- utils/__init__.py +0 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/data_preprocessing.cpython-310.pyc +0 -0
- utils/__pycache__/tokenizer.cpython-310.pyc +0 -0
- utils/data_preprocessing.py +40 -0
- utils/text_generation.py +18 -0
- utils/tokenizer.py +23 -0
GEM_1o_Aug_15.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b753b1132c3c9fdc9bbdedd57d1f964d1c45534860b6416192fcc00a832df86
size 2024369114
Testings/testing.py
ADDED
@@ -0,0 +1,63 @@
import torch
import sys
import os

# Add the parent directory of the model folder to the system path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../models')))

from gem_model import GEM

# Configuration parameters for GEM
vocab_size = 50001  # Example vocab size, adjust if necessary
d_model = 1024      # Dimension of the model
n_heads = 32        # Number of attention heads
d_ff = 4096         # Dimension of the feedforward network
n_layers = 32       # Number of transformer layers
dropout = 0.1       # Dropout rate

# Initialize the model
model = GEM(vocab_size, d_model, n_heads, d_ff, n_layers, dropout)

# Load pre-trained weights
model_path = '/content/drive/MyDrive/GEM_Project/GEM_1o_Aug_15.pt'
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# Set the model to evaluation mode
model.eval()

# Define a function to convert text to token IDs (example)
def text_to_ids(tokenizer, text):
    # Implement this function based on your tokenizer's method
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

# Define a tokenizer or token conversion function (example placeholder)
class DummyTokenizer:
    def tokenize(self, text):
        # Simple tokenization example, replace with actual tokenizer
        return text.split()

    def convert_tokens_to_ids(self, tokens):
        # Simple mapping example, replace with actual ID mapping
        return [ord(token[0]) % 50000 for token in tokens]

# Initialize tokenizer
tokenizer = DummyTokenizer()

# Test input
test_prompt = "This is a test."
test_input_ids = torch.tensor(text_to_ids(tokenizer, test_prompt), dtype=torch.long).unsqueeze(0)  # Add batch dimension
attention_mask = torch.ones(test_input_ids.shape, dtype=torch.bool)

# Perform a forward pass
with torch.no_grad():
    outputs = model(test_input_ids, attention_mask)
    print("Model outputs:")
    print(outputs)

# Test the generate method
generation_prompt = "Once upon a time"
input_ids = torch.tensor(text_to_ids(tokenizer, generation_prompt), dtype=torch.long).unsqueeze(0)  # Add batch dimension
generated_ids = model.generate(input_ids, max_length=10, temperature=1.0)
print("Generated IDs:")
print(generated_ids)
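One caveat for this test script: GEM.forward (models/gem_model.py below) passes the mask straight to torch.nn.TransformerEncoder as src_key_padding_mask, where True means "ignore this position". The all-True mask built above therefore masks every token. A minimal sketch of a mask in that convention, assuming a pad_token_id is available (hypothetical here, since DummyTokenizer does not define one):

# Sketch: padding mask in src_key_padding_mask convention (True = padding to ignore).
# pad_token_id is a hypothetical value; DummyTokenizer above has no pad token.
pad_token_id = 0
padding_mask = test_input_ids.eq(pad_token_id)      # True only where the input is padding
with torch.no_grad():
    outputs = model(test_input_ids, padding_mask)   # real tokens are no longer masked out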
configs/__pycache__/config.cpython-310.pyc
ADDED
Binary file (803 Bytes)
configs/config.py
ADDED
@@ -0,0 +1,28 @@
import torch

MODEL_CONFIG = {
    'VOCAB_SIZE': 50000,
    'D_MODEL': 1024,
    'N_HEADS': 32,
    'D_FF': 4096,
    'N_LAYERS': 32,
    'MAX_SEQ_LEN': 512,
    'BATCH_SIZE': 32,
    'LEARNING_RATE': 1e-4,
    'NUM_EPOCHS': 20,
    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu',
    'WARMUP_STEPS': 4000,
    'ADAM_EPSILON': 1e-8,
    'WEIGHT_DECAY': 0.01,
    'GRADIENT_ACCUMULATION_STEPS': 2,
    'MAX_GRAD_NORM': 1.0,
    'DROPOUT': 0.1,
}

TRAINING_CONFIG = {
    'CHECKPOINT_SAVE_STEPS': 5000,
    'LOGGING_STEPS': 100,
    'EVAL_STEPS': 1000,
    'SAVE_TOTAL_LIMIT': 5
}
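Note that the vocabulary size is specified inconsistently across the commit: MODEL_CONFIG uses 50000, Testings/testing.py hard-codes 50001, and train.py builds the model with len(tokenizer) after a [PAD] token may have been added. load_state_dict only succeeds when the embedding and output shapes match the checkpoint, so a quick inspection like the sketch below (assuming GEM_1o_Aug_15.pt is the raw state_dict that train.py saves) shows which size the checkpoint was trained with:

import torch

# Inspect the checkpoint's vocab dimension (sketch; key names follow models/gem_model.py).
state = torch.load('GEM_1o_Aug_15.pt', map_location='cpu')
print(state['embedding.weight'].shape)   # (vocab_size_at_training, d_model)
print(state['fc_out.weight'].shape)      # (vocab_size_at_training, d_model)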
generate.py
ADDED
@@ -0,0 +1,37 @@
import torch
from models.gem_model import GEM
from utils.data_preprocessing import load_tokenizer
from configs.config import MODEL_CONFIG

def generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7):
    device = torch.device(MODEL_CONFIG['DEVICE'])
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    generated = model.generate(input_ids, max_length=max_length, temperature=temperature)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def main():
    device = torch.device(MODEL_CONFIG['DEVICE'])

    tokenizer = load_tokenizer()

    model = GEM(
        vocab_size=MODEL_CONFIG['VOCAB_SIZE'],
        d_model=MODEL_CONFIG['D_MODEL'],
        n_heads=MODEL_CONFIG['N_HEADS'],
        d_ff=MODEL_CONFIG['D_FF'],
        n_layers=MODEL_CONFIG['N_LAYERS'],
        max_seq_len=MODEL_CONFIG['MAX_SEQ_LEN'],
        dropout=MODEL_CONFIG['DROPOUT']
    ).to(device)

    checkpoint = torch.load('final_model/model.pt')
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    prompt = "Once upon a time"
    generated_text = generate_text(model, tokenizer, prompt, max_length=100)
    print(f"Generated text:\n{generated_text}")

if __name__ == "__main__":
    main()
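Two details worth flagging against the rest of the commit: GEM.__init__ in models/gem_model.py (below) takes (vocab_size, d_model, n_heads, d_ff, n_layers, dropout) and has no max_seq_len parameter, so the call above would raise a TypeError as written; and train.py saves a raw state_dict to GEM_1o_Aug_15.pt rather than a {'model_state_dict': ...} dict at final_model/model.pt (a path not present in this commit). A construction and load matching what the commit actually provides might look like this sketch, reusing the names already imported in generate.py:

# Sketch: build GEM with the committed signature and load the raw state_dict from train.py.
model = GEM(
    vocab_size=MODEL_CONFIG['VOCAB_SIZE'],   # must match the checkpoint's vocab size
    d_model=MODEL_CONFIG['D_MODEL'],
    n_heads=MODEL_CONFIG['N_HEADS'],
    d_ff=MODEL_CONFIG['D_FF'],
    n_layers=MODEL_CONFIG['N_LAYERS'],
    dropout=MODEL_CONFIG['DROPOUT']
).to(device)
model.load_state_dict(torch.load('GEM_1o_Aug_15.pt', map_location=device))
model.eval()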
models/__init__.py
ADDED
File without changes
models/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (146 Bytes)
models/__pycache__/gem_model.cpython-310.pyc
ADDED
Binary file (2.59 kB)
models/gem_model.py
ADDED
@@ -0,0 +1,55 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class GEM(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, n_layers, dropout=0.1):
        super(GEM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids) * math.sqrt(self.d_model)
        x = self.positional_encoding(x)

        if attention_mask is not None:
            # Ensure attention_mask is in the shape (batch_size, sequence_length)
            attention_mask = attention_mask.bool()  # Ensure it's a boolean tensor
            x = self.transformer_encoder(x, src_key_padding_mask=attention_mask)
        else:
            x = self.transformer_encoder(x)

        x = self.fc_out(x)
        return x

    def generate(self, input_ids, max_length, temperature=1.0):
        self.eval()
        with torch.no_grad():
            for _ in range(max_length - input_ids.size(1)):
                outputs = self(input_ids)
                next_token_logits = outputs[:, -1, :] / temperature
                next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)
                input_ids = torch.cat([input_ids, next_token], dim=-1)
            return input_ids
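A layout note on PositionalEncoding: the encoder layers are built with batch_first=True, so x arrives as (batch, seq, d_model), but pe is stored as (max_len, 1, d_model) and sliced with x.size(0), i.e. by batch size rather than by sequence position. A batch-first variant, offered as an alternative sketch rather than the committed code, would look like this:

import math
import torch
import torch.nn as nn

class BatchFirstPositionalEncoding(nn.Module):
    # Sketch of a sinusoidal positional encoding for batch_first inputs
    # (an alternative, not the committed PositionalEncoding).
    def __init__(self, d_model, max_len=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)            # (1, seq, d_model) broadcasts over the batch
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):                                # x: (batch, seq, d_model)
        x = x + self.pe[:, :x.size(1), :]                # slice by sequence length, not batch size
        return self.dropout(x)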
requirements.txt
ADDED
@@ -0,0 +1,7 @@
torch
transformers
datasets
tensorboard
tokenizers
tqdm
wandb
tokenizer-merges.txt
ADDED
The diff for this file is too large to render.
tokenizer-vocab.json
ADDED
The diff for this file is too large to render.
tokenizer/gem_tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer/merges.txt
ADDED
The diff for this file is too large to render.
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
{
  "bos_token": "<|endoftext|>",
  "eos_token": "<|endoftext|>",
  "unk_token": "<|endoftext|>"
}
tokenizer/tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,19 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|endoftext|>",
  "clean_up_tokenization_spaces": true,
  "eos_token": "<|endoftext|>",
  "model_max_length": 1024,
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "<|endoftext|>"
}
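These tokenizer files are what utils/data_preprocessing.load_tokenizer reads back. A quick sanity check might look like this sketch:

from transformers import AutoTokenizer

# Load the committed tokenizer directory and confirm the special tokens configured above.
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")
print(tokenizer.eos_token)          # "<|endoftext|>"
print(tokenizer.model_max_length)   # 1024
print(len(tokenizer))               # vocabulary size, including any added tokens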
tokenizer/vocab.json
ADDED
The diff for this file is too large to render.
train.py
ADDED
@@ -0,0 +1,86 @@
import torch
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
import wandb
from transformers import get_linear_schedule_with_warmup
from utils.data_preprocessing import get_dataloader, load_tokenizer
from models.gem_model import GEM
from configs.config import MODEL_CONFIG, TRAINING_CONFIG

def train():
    wandb.init(project="GEM_Project", config=MODEL_CONFIG, mode="offline")
    print("WandB initialized in offline mode.")

    tokenizer = load_tokenizer()
    print("Tokenizer loaded.")

    dataloader = get_dataloader('wikitext', 'wikitext-2-raw-v1', tokenizer, MODEL_CONFIG['MAX_SEQ_LEN'], MODEL_CONFIG['BATCH_SIZE'])
    print("Dataloader created.")

    model = GEM(
        vocab_size=len(tokenizer),
        d_model=MODEL_CONFIG['D_MODEL'],
        n_heads=MODEL_CONFIG['N_HEADS'],
        d_ff=MODEL_CONFIG['D_FF'],
        n_layers=MODEL_CONFIG['N_LAYERS'],
        dropout=MODEL_CONFIG['DROPOUT']
    ).to(MODEL_CONFIG['DEVICE'])
    print("Model initialized.")

    optimizer = optim.AdamW(model.parameters(), lr=MODEL_CONFIG['LEARNING_RATE'], eps=MODEL_CONFIG['ADAM_EPSILON'])
    total_steps = len(dataloader) * MODEL_CONFIG['NUM_EPOCHS'] // MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=MODEL_CONFIG['WARMUP_STEPS'],
        num_training_steps=total_steps
    )
    print("Optimizer and scheduler set up.")

    # Mixed precision setup
    scaler = torch.cuda.amp.GradScaler()

    model.train()
    print("Starting training loop.")
    for epoch in range(MODEL_CONFIG['NUM_EPOCHS']):
        print(f"Epoch {epoch + 1}/{MODEL_CONFIG['NUM_EPOCHS']} started.")
        for step, batch in enumerate(tqdm(dataloader, desc=f"Epoch {epoch + 1}")):
            batch = batch.to(MODEL_CONFIG['DEVICE'])

            # Mixed precision training
            with torch.cuda.amp.autocast():
                outputs = model(batch)
                loss = F.cross_entropy(outputs.view(-1, outputs.size(-1)), batch.view(-1))

            # Gradient accumulation
            loss = loss / MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']
            scaler.scale(loss).backward()

            if (step + 1) % MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS'] == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), MODEL_CONFIG['MAX_GRAD_NORM'])
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

            if step % TRAINING_CONFIG['LOGGING_STEPS'] == 0:
                wandb.log({"loss": loss.item() * MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']})

            if step % TRAINING_CONFIG['EVAL_STEPS'] == 0:
                model.eval()
                with torch.no_grad():
                    val_loss = sum(F.cross_entropy(model(batch).view(-1, outputs.size(-1)), batch.view(-1)).item() for batch in dataloader)
                    wandb.log({"val_loss": val_loss / len(dataloader)})
                model.train()

            if step % TRAINING_CONFIG['CHECKPOINT_SAVE_STEPS'] == 0:
                torch.save(model.state_dict(), f"checkpoint_{epoch}_{step}.pt")

    torch.save(model.state_dict(), "GEM_1o_Aug_15.pt")
    print("Training complete. Final model saved.")

if __name__ == "__main__":
    train()
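Two training-objective details worth noting: the loss above compares the prediction at position t with the input token at that same position t, and the encoder applies no causal mask, so every position can attend to later tokens (the committed GEM.forward does not accept an attention mask for this purpose). A conventional next-token setup, as a sketch reusing the names from the loop above rather than the committed loss, shifts the labels and ignores padding:

# Sketch: next-token loss with padding ignored (assumes tokenizer.pad_token_id is set,
# which load_tokenizer ensures by adding a [PAD] token when missing).
logits = outputs[:, :-1, :]                # predictions for positions 0 .. T-2
labels = batch[:, 1:]                      # targets are the following tokens
loss = F.cross_entropy(
    logits.reshape(-1, logits.size(-1)),
    labels.reshape(-1),
    ignore_index=tokenizer.pad_token_id,
)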
utils/__init__.py
ADDED
File without changes
utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (145 Bytes)
utils/__pycache__/data_preprocessing.cpython-310.pyc
ADDED
Binary file (1.93 kB)
utils/__pycache__/tokenizer.cpython-310.pyc
ADDED
Binary file (1.66 kB)
utils/data_preprocessing.py
ADDED
@@ -0,0 +1,40 @@
import torch
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import AutoTokenizer

def train_tokenizer(texts, vocab_size=50000, min_frequency=2):
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer = tokenizer.train_new_from_iterator(texts, vocab_size=vocab_size, min_frequency=min_frequency)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.save_pretrained("./tokenizer")
    return tokenizer

def load_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("./tokenizer")
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    return tokenizer

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encodings = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length)
        return torch.tensor(encodings['input_ids'])

def get_dataloader(dataset_name, config_name, tokenizer, max_length, batch_size):
    dataset = load_dataset(dataset_name, config_name)
    texts = dataset['train']['text'][:50]  # delete this slice for actual training with the full vocab size
    dataset = TextDataset(texts, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader
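A quick usage sketch for the pieces above (the [:50] slice keeps only 50 wikitext lines, so this is a smoke test rather than real training data):

from utils.data_preprocessing import load_tokenizer, get_dataloader

tokenizer = load_tokenizer()          # expects the committed ./tokenizer directory
loader = get_dataloader('wikitext', 'wikitext-2-raw-v1', tokenizer,
                        max_length=512, batch_size=32)
batch = next(iter(loader))            # LongTensor of shape (batch_size, max_length)
print(batch.shape)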
utils/text_generation.py
ADDED
@@ -0,0 +1,18 @@
import torch

def generate_text(model, tokenizer, prompt, max_length=50, device='cuda'):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            next_token_logits = outputs[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)
            input_ids = torch.cat([input_ids, next_token], dim=-1)

            if next_token.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)
utils/tokenizer.py
ADDED
@@ -0,0 +1,23 @@
# utils/tokenizer.py

class CharTokenizer:
    def __init__(self):
        self.chars = set()
        self.char2idx = {}
        self.idx2char = {}

    def fit(self, texts):
        for text in texts:
            self.chars.update(set(text))
        self.chars = sorted(list(self.chars))
        self.char2idx = {char: idx for idx, char in enumerate(self.chars)}
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}

    def encode(self, text, max_length=None):
        encoded = [self.char2idx[char] for char in text if char in self.char2idx]
        if max_length:
            encoded = encoded[:max_length] + [0] * (max_length - len(encoded))
        return encoded

    def decode(self, tokens):
        return ''.join([self.idx2char[token] for token in tokens if token in self.idx2char])
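CharTokenizer is not referenced by the other scripts in this commit (they use the Hugging Face tokenizer in ./tokenizer). A round-trip sketch:

from utils.tokenizer import CharTokenizer

tok = CharTokenizer()
tok.fit(["hello world"])
ids = tok.encode("hello", max_length=8)   # right-padded with 0s up to max_length
print(ids)
print(tok.decode(ids))                    # note: id 0 also maps to a real character, so padding decodes as that character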