Text Generation · English · instruction-following · reasoning

comethrusws committed · verified
Commit d18eb09 · 1 Parent(s): 83d6c4c

Commit #1: GEM_1o_Aug trained

GEM_1o_Aug_15.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b753b1132c3c9fdc9bbdedd57d1f964d1c45534860b6416192fcc00a832df86
+size 2024369114
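
GEM_1o_Aug_15.pt is stored via Git LFS, so the diff above is only the pointer; the actual weights (roughly 2 GB) are fetched by LFS. A minimal sketch for verifying a downloaded copy against the pointer's oid and size (the local path is assumed):

import hashlib
import os

def verify_lfs_file(path, expected_sha256, expected_size):
    # Compare the downloaded file against the size and SHA-256 recorded in the LFS pointer
    assert os.path.getsize(path) == expected_size, "size mismatch"
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    assert digest.hexdigest() == expected_sha256, "sha256 mismatch"

verify_lfs_file(
    "GEM_1o_Aug_15.pt",
    "3b753b1132c3c9fdc9bbdedd57d1f964d1c45534860b6416192fcc00a832df86",
    2024369114,
)
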
Testings/testing.py ADDED
@@ -0,0 +1,63 @@
+
+import torch
+import sys
+import os
+
+# Add the models directory to the import path so gem_model can be imported directly
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../models')))
+
+from gem_model import GEM
+
+# Configuration parameters for GEM
+vocab_size = 50001  # Example vocab size; adjust to match the trained checkpoint
+d_model = 1024      # Dimension of the model
+n_heads = 32        # Number of attention heads
+d_ff = 4096         # Dimension of the feedforward network
+n_layers = 32       # Number of transformer layers
+dropout = 0.1       # Dropout rate
+
+# Initialize the model
+model = GEM(vocab_size, d_model, n_heads, d_ff, n_layers, dropout)
+
+# Load pre-trained weights
+model_path = '/content/drive/MyDrive/GEM_Project/GEM_1o_Aug_15.pt'
+model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
+
+# Set the model to evaluation mode
+model.eval()
+
+# Convert text to token IDs (example; replace with the project tokenizer)
+def text_to_ids(tokenizer, text):
+    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
+
+# Placeholder tokenizer used only for this smoke test
+class DummyTokenizer:
+    def tokenize(self, text):
+        # Whitespace tokenization; replace with the actual tokenizer
+        return text.split()
+
+    def convert_tokens_to_ids(self, tokens):
+        # Toy mapping to IDs; replace with the actual vocabulary lookup
+        return [ord(token[0]) % 50000 for token in tokens]
+
+# Initialize tokenizer
+tokenizer = DummyTokenizer()
+
+# Test input
+test_prompt = "This is a test."
+test_input_ids = torch.tensor(text_to_ids(tokenizer, test_prompt), dtype=torch.long).unsqueeze(0)  # Add batch dimension
+attention_mask = torch.ones(test_input_ids.shape, dtype=torch.bool)  # 1/True = attend (HF convention)
+
+# Perform a forward pass
+with torch.no_grad():
+    outputs = model(test_input_ids, attention_mask)
+print("Model outputs:")
+print(outputs)
+
+# Test the generate method
+generation_prompt = "Once upon a time"
+input_ids = torch.tensor(text_to_ids(tokenizer, generation_prompt), dtype=torch.long).unsqueeze(0)  # Add batch dimension
+generated_ids = model.generate(input_ids, max_length=10, temperature=1.0)
+print("Generated IDs:")
+print(generated_ids)
configs/__pycache__/config.cpython-310.pyc ADDED
Binary file (803 Bytes). View file
 
configs/config.py ADDED
@@ -0,0 +1,28 @@
+
+import torch
+
+MODEL_CONFIG = {
+    'VOCAB_SIZE': 50000,
+    'D_MODEL': 1024,
+    'N_HEADS': 32,
+    'D_FF': 4096,
+    'N_LAYERS': 32,
+    'MAX_SEQ_LEN': 512,
+    'BATCH_SIZE': 32,
+    'LEARNING_RATE': 1e-4,
+    'NUM_EPOCHS': 20,
+    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu',
+    'WARMUP_STEPS': 4000,
+    'ADAM_EPSILON': 1e-8,
+    'WEIGHT_DECAY': 0.01,
+    'GRADIENT_ACCUMULATION_STEPS': 2,
+    'MAX_GRAD_NORM': 1.0,
+    'DROPOUT': 0.1,
+}
+
+TRAINING_CONFIG = {
+    'CHECKPOINT_SAVE_STEPS': 5000,
+    'LOGGING_STEPS': 100,
+    'EVAL_STEPS': 1000,
+    'SAVE_TOTAL_LIMIT': 5
+}
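
For orientation, these values imply an effective batch size of BATCH_SIZE × GRADIENT_ACCUMULATION_STEPS per optimizer update, and train.py derives the scheduler's total step count from the same constants. A small illustrative calculation (the dataloader length of 1000 is a made-up placeholder):

from configs.config import MODEL_CONFIG

# Effective batch size seen by each optimizer update
effective_batch = MODEL_CONFIG['BATCH_SIZE'] * MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']  # 32 * 2 = 64

# Same formula train.py uses for the LR schedule; 1000 stands in for len(dataloader)
num_batches = 1000
total_steps = num_batches * MODEL_CONFIG['NUM_EPOCHS'] // MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']
print(effective_batch, total_steps)  # 64 10000
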
generate.py ADDED
@@ -0,0 +1,37 @@
+
+import torch
+from models.gem_model import GEM
+from utils.data_preprocessing import load_tokenizer
+from configs.config import MODEL_CONFIG
+
+def generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7):
+    device = torch.device(MODEL_CONFIG['DEVICE'])
+    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
+    generated = model.generate(input_ids, max_length=max_length, temperature=temperature)
+    return tokenizer.decode(generated[0], skip_special_tokens=True)
+
+def main():
+    device = torch.device(MODEL_CONFIG['DEVICE'])
+
+    tokenizer = load_tokenizer()
+
+    # Note: GEM does not accept a max_seq_len argument; sequence length is capped
+    # by the PositionalEncoding max_len inside the model.
+    model = GEM(
+        vocab_size=MODEL_CONFIG['VOCAB_SIZE'],
+        d_model=MODEL_CONFIG['D_MODEL'],
+        n_heads=MODEL_CONFIG['N_HEADS'],
+        d_ff=MODEL_CONFIG['D_FF'],
+        n_layers=MODEL_CONFIG['N_LAYERS'],
+        dropout=MODEL_CONFIG['DROPOUT']
+    ).to(device)
+
+    # Expects a checkpoint dict containing 'model_state_dict'; train.py saves a bare
+    # state_dict, so adapt this line if loading those files instead.
+    checkpoint = torch.load('final_model/model.pt', map_location=device)
+    model.load_state_dict(checkpoint['model_state_dict'])
+    model.eval()
+
+    prompt = "Once upon a time"
+    generated_text = generate_text(model, tokenizer, prompt, max_length=100)
+    print(f"Generated text:\n{generated_text}")
+
+if __name__ == "__main__":
+    main()
models/__init__.py ADDED
File without changes
models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (146 Bytes). View file
 
models/__pycache__/gem_model.cpython-310.pyc ADDED
Binary file (2.59 kB). View file
 
models/gem_model.py ADDED
@@ -0,0 +1,55 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=512, dropout=0.1):
+        super(PositionalEncoding, self).__init__()
+        self.dropout = nn.Dropout(p=dropout)
+
+        position = torch.arange(0, max_len).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
+        # pe has shape (max_len, 1, d_model); this layout is kept so existing checkpoints still load
+        pe = torch.zeros(max_len, 1, d_model)
+        pe[:, 0, 0::2] = torch.sin(position * div_term)
+        pe[:, 0, 1::2] = torch.cos(position * div_term)
+        self.register_buffer('pe', pe)
+
+    def forward(self, x):
+        # x is (batch_size, seq_len, d_model) because the encoder runs batch_first;
+        # slice by sequence length and move it to dim 1 so the encoding broadcasts over the batch
+        x = x + self.pe[:x.size(1)].transpose(0, 1)
+        return self.dropout(x)
+
+class GEM(nn.Module):
+    def __init__(self, vocab_size, d_model, n_heads, d_ff, n_layers, dropout=0.1):
+        super(GEM, self).__init__()
+        self.embedding = nn.Embedding(vocab_size, d_model)
+        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)
+        encoder_layers = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout, batch_first=True)
+        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)
+        self.fc_out = nn.Linear(d_model, vocab_size)
+        self.d_model = d_model
+
+    def forward(self, input_ids, attention_mask=None):
+        x = self.embedding(input_ids) * math.sqrt(self.d_model)
+        x = self.positional_encoding(x)
+
+        if attention_mask is not None:
+            # attention_mask follows the 1/True = attend convention, shape (batch_size, seq_len);
+            # src_key_padding_mask expects True = ignore, so invert it
+            x = self.transformer_encoder(x, src_key_padding_mask=~attention_mask.bool())
+        else:
+            x = self.transformer_encoder(x)
+
+        x = self.fc_out(x)
+        return x
+
+    def generate(self, input_ids, max_length, temperature=1.0):
+        self.eval()
+        with torch.no_grad():
+            for _ in range(max_length - input_ids.size(1)):
+                outputs = self(input_ids)
+                # Sample the next token from the temperature-scaled distribution over the last position
+                next_token_logits = outputs[:, -1, :] / temperature
+                next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)
+                input_ids = torch.cat([input_ids, next_token], dim=-1)
+        return input_ids
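
As a quick illustration of the forward contract (token IDs in, per-position vocabulary logits out), a tiny configuration can be shape-checked like this; the small dimensions are arbitrary and only for the example:

import torch
from models.gem_model import GEM

model = GEM(vocab_size=100, d_model=32, n_heads=4, d_ff=64, n_layers=2, dropout=0.0)
input_ids = torch.randint(0, 100, (2, 8))            # (batch_size=2, seq_len=8)
attention_mask = torch.ones(2, 8, dtype=torch.bool)  # 1/True = attend
logits = model(input_ids, attention_mask)
print(logits.shape)  # torch.Size([2, 8, 100]) -> (batch, seq_len, vocab_size)
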
requirements.txt ADDED
@@ -0,0 +1,7 @@
+torch
+transformers
+datasets
+tensorboard
+tokenizers
+tqdm
+wandb
tokenizer-merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer-vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/gem_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1024,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}
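
These files let tokenizer/ be loaded as a GPT-2 style tokenizer, which is what utils/data_preprocessing.load_tokenizer does. A minimal usage sketch:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./tokenizer")  # the directory added in this commit
ids = tokenizer.encode("Once upon a time")
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=True))
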
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
train.py ADDED
@@ -0,0 +1,86 @@
+
+import torch
+import torch.optim as optim
+from torch.nn import functional as F
+from tqdm import tqdm
+import wandb
+from transformers import get_linear_schedule_with_warmup
+from utils.data_preprocessing import get_dataloader, load_tokenizer
+from models.gem_model import GEM
+from configs.config import MODEL_CONFIG, TRAINING_CONFIG
+
+def train():
+    wandb.init(project="GEM_Project", config=MODEL_CONFIG, mode="offline")
+    print("WandB initialized in offline mode.")
+
+    tokenizer = load_tokenizer()
+    print("Tokenizer loaded.")
+
+    dataloader = get_dataloader('wikitext', 'wikitext-2-raw-v1', tokenizer, MODEL_CONFIG['MAX_SEQ_LEN'], MODEL_CONFIG['BATCH_SIZE'])
+    print("Dataloader created.")
+
+    model = GEM(
+        vocab_size=len(tokenizer),
+        d_model=MODEL_CONFIG['D_MODEL'],
+        n_heads=MODEL_CONFIG['N_HEADS'],
+        d_ff=MODEL_CONFIG['D_FF'],
+        n_layers=MODEL_CONFIG['N_LAYERS'],
+        dropout=MODEL_CONFIG['DROPOUT']
+    ).to(MODEL_CONFIG['DEVICE'])
+    print("Model initialized.")
+
+    optimizer = optim.AdamW(
+        model.parameters(),
+        lr=MODEL_CONFIG['LEARNING_RATE'],
+        eps=MODEL_CONFIG['ADAM_EPSILON'],
+        weight_decay=MODEL_CONFIG['WEIGHT_DECAY']
+    )
+    total_steps = len(dataloader) * MODEL_CONFIG['NUM_EPOCHS'] // MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=MODEL_CONFIG['WARMUP_STEPS'],
+        num_training_steps=total_steps
+    )
+    print("Optimizer and scheduler set up.")
+
+    # Mixed precision setup
+    scaler = torch.cuda.amp.GradScaler()
+
+    model.train()
+    print("Starting training loop.")
+    for epoch in range(MODEL_CONFIG['NUM_EPOCHS']):
+        print(f"Epoch {epoch + 1}/{MODEL_CONFIG['NUM_EPOCHS']} started.")
+        for step, batch in enumerate(tqdm(dataloader, desc=f"Epoch {epoch + 1}")):
+            batch = batch.to(MODEL_CONFIG['DEVICE'])
+
+            # Mixed precision training with a next-token objective:
+            # positions 0..T-2 predict tokens 1..T-1, ignoring padding
+            with torch.cuda.amp.autocast():
+                outputs = model(batch)
+                loss = F.cross_entropy(
+                    outputs[:, :-1, :].reshape(-1, outputs.size(-1)),
+                    batch[:, 1:].reshape(-1),
+                    ignore_index=tokenizer.pad_token_id
+                )
+
+            # Gradient accumulation
+            loss = loss / MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']
+            scaler.scale(loss).backward()
+
+            if (step + 1) % MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS'] == 0:
+                scaler.unscale_(optimizer)
+                torch.nn.utils.clip_grad_norm_(model.parameters(), MODEL_CONFIG['MAX_GRAD_NORM'])
+                scaler.step(optimizer)
+                scaler.update()
+                scheduler.step()
+                optimizer.zero_grad()
+
+            if step % TRAINING_CONFIG['LOGGING_STEPS'] == 0:
+                wandb.log({"loss": loss.item() * MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']})
+
+            if step % TRAINING_CONFIG['EVAL_STEPS'] == 0:
+                model.eval()
+                with torch.no_grad():
+                    # Evaluates on the training dataloader; swap in a held-out split when available
+                    val_loss = 0.0
+                    for val_batch in dataloader:
+                        val_batch = val_batch.to(MODEL_CONFIG['DEVICE'])
+                        val_outputs = model(val_batch)
+                        val_loss += F.cross_entropy(
+                            val_outputs[:, :-1, :].reshape(-1, val_outputs.size(-1)),
+                            val_batch[:, 1:].reshape(-1),
+                            ignore_index=tokenizer.pad_token_id
+                        ).item()
+                    wandb.log({"val_loss": val_loss / len(dataloader)})
+                model.train()
+
+            if step % TRAINING_CONFIG['CHECKPOINT_SAVE_STEPS'] == 0:
+                torch.save(model.state_dict(), f"checkpoint_{epoch}_{step}.pt")
+
+    torch.save(model.state_dict(), "GEM_1o_Aug_15.pt")
+    print("Training complete. Final model saved.")
+
+if __name__ == "__main__":
+    train()
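
train.py saves bare state_dicts (the periodic checkpoint_{epoch}_{step}.pt files and the final GEM_1o_Aug_15.pt), so reloading means rebuilding the module first; optimizer and scheduler state are not checkpointed, so a restart resumes weights only. A minimal sketch, assuming the file sits in the working directory:

import torch
from models.gem_model import GEM
from configs.config import MODEL_CONFIG
from utils.data_preprocessing import load_tokenizer

# Rebuild the architecture exactly as in train.py, then load the saved weights
tokenizer = load_tokenizer()
model = GEM(
    vocab_size=len(tokenizer),
    d_model=MODEL_CONFIG['D_MODEL'],
    n_heads=MODEL_CONFIG['N_HEADS'],
    d_ff=MODEL_CONFIG['D_FF'],
    n_layers=MODEL_CONFIG['N_LAYERS'],
    dropout=MODEL_CONFIG['DROPOUT']
)
state = torch.load("GEM_1o_Aug_15.pt", map_location="cpu")  # or a checkpoint_{epoch}_{step}.pt file
model.load_state_dict(state)
model.eval()
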
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (145 Bytes). View file
 
utils/__pycache__/data_preprocessing.cpython-310.pyc ADDED
Binary file (1.93 kB). View file
 
utils/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (1.66 kB). View file
 
utils/data_preprocessing.py ADDED
@@ -0,0 +1,40 @@
+
+import torch
+from torch.utils.data import DataLoader, Dataset
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+def train_tokenizer(texts, vocab_size=50000, min_frequency=2):
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    tokenizer = tokenizer.train_new_from_iterator(texts, vocab_size=vocab_size, min_frequency=min_frequency)
+    if tokenizer.pad_token is None:
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+    tokenizer.save_pretrained("./tokenizer")
+    return tokenizer
+
+def load_tokenizer():
+    tokenizer = AutoTokenizer.from_pretrained("./tokenizer")
+    if tokenizer.pad_token is None:
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+    return tokenizer
+
+class TextDataset(Dataset):
+    def __init__(self, texts, tokenizer, max_length):
+        self.texts = texts
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+    def __len__(self):
+        return len(self.texts)
+
+    def __getitem__(self, idx):
+        text = self.texts[idx]
+        encodings = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length)
+        return torch.tensor(encodings['input_ids'])
+
+def get_dataloader(dataset_name, config_name, tokenizer, max_length, batch_size):
+    dataset = load_dataset(dataset_name, config_name)
+    texts = dataset['train']['text'][:50]  # Small debug subset; remove the slice to train on the full corpus with the full vocab size
+    dataset = TextDataset(texts, tokenizer, max_length)
+    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+    return dataloader
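
A quick way to see what get_dataloader yields (each batch is a single tensor of padded token IDs); a short sketch with the saved tokenizer and the same wikitext arguments train.py uses:

from utils.data_preprocessing import get_dataloader, load_tokenizer

tokenizer = load_tokenizer()
dataloader = get_dataloader('wikitext', 'wikitext-2-raw-v1', tokenizer, max_length=512, batch_size=4)
batch = next(iter(dataloader))
print(batch.shape)  # torch.Size([4, 512]) -> (batch_size, max_length) of input IDs
print(batch.dtype)  # torch.int64
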
utils/text_generation.py ADDED
@@ -0,0 +1,18 @@
+
+import torch
+
+def generate_text(model, tokenizer, prompt, max_length=50, device='cuda'):
+    model.eval()
+    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
+
+    with torch.no_grad():
+        for _ in range(max_length):
+            outputs = model(input_ids)
+            next_token_logits = outputs[:, -1, :]
+            # Greedy decoding: append the highest-probability token, shaped (batch, 1)
+            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)
+            input_ids = torch.cat([input_ids, next_token], dim=-1)
+
+            # Stop at the end-of-sequence token (assumes a single prompt, batch size 1)
+            if next_token.item() == tokenizer.eos_token_id:
+                break
+
+    return tokenizer.decode(input_ids[0], skip_special_tokens=True)
utils/tokenizer.py ADDED
@@ -0,0 +1,23 @@
+# utils/tokenizer.py
+
+class CharTokenizer:
+    def __init__(self):
+        self.chars = set()
+        self.char2idx = {}
+        self.idx2char = {}
+
+    def fit(self, texts):
+        # Build a sorted character vocabulary from the training texts
+        for text in texts:
+            self.chars.update(set(text))
+        self.chars = sorted(list(self.chars))
+        self.char2idx = {char: idx for idx, char in enumerate(self.chars)}
+        self.idx2char = {idx: char for char, idx in self.char2idx.items()}
+
+    def encode(self, text, max_length=None):
+        # Unknown characters are dropped; sequences are truncated or zero-padded to max_length
+        encoded = [self.char2idx[char] for char in text if char in self.char2idx]
+        if max_length:
+            encoded = encoded[:max_length] + [0] * (max_length - len(encoded))
+        return encoded
+
+    def decode(self, tokens):
+        return ''.join([self.idx2char[token] for token in tokens if token in self.idx2char])
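
CharTokenizer is not wired into the training scripts (they use the GPT-2 style tokenizer above); its intended use is a simple fit/encode/decode round trip, illustrated here:

from utils.tokenizer import CharTokenizer

tok = CharTokenizer()
tok.fit(["hello world", "hello gem"])
ids = tok.encode("hello", max_length=8)  # truncated / zero-padded to length 8
print(ids)
print(tok.decode(ids))  # note: padding zeros decode to the character at index 0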