Commit #1: GEM_1o_Aug trained
- GEM_1o_Aug_15.pt +3 -0
- Testings/testing.py +63 -0
- configs/__pycache__/config.cpython-310.pyc +0 -0
- configs/config.py +28 -0
- generate.py +37 -0
- models/__init__.py +0 -0
- models/__pycache__/__init__.cpython-310.pyc +0 -0
- models/__pycache__/gem_model.cpython-310.pyc +0 -0
- models/gem_model.py +55 -0
- requirements.txt +7 -0
- tokenizer-merges.txt +0 -0
- tokenizer-vocab.json +0 -0
- tokenizer/gem_tokenizer.json +0 -0
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens_map.json +5 -0
- tokenizer/tokenizer.json +0 -0
- tokenizer/tokenizer_config.json +19 -0
- tokenizer/vocab.json +0 -0
- train.py +86 -0
- utils/__init__.py +0 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/data_preprocessing.cpython-310.pyc +0 -0
- utils/__pycache__/tokenizer.cpython-310.pyc +0 -0
- utils/data_preprocessing.py +40 -0
- utils/text_generation.py +18 -0
- utils/tokenizer.py +23 -0
GEM_1o_Aug_15.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b753b1132c3c9fdc9bbdedd57d1f964d1c45534860b6416192fcc00a832df86
size 2024369114
Testings/testing.py
ADDED
@@ -0,0 +1,63 @@
import torch
import sys
import os

# Add the parent directory of the model folder to the system path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../models')))

from gem_model import GEM

# Configuration parameters for GEM
vocab_size = 50001  # Example vocab size, adjust if necessary
d_model = 1024      # Dimension of the model
n_heads = 32        # Number of attention heads
d_ff = 4096         # Dimension of the feedforward network
n_layers = 32       # Number of transformer layers
dropout = 0.1       # Dropout rate

# Initialize the model
model = GEM(vocab_size, d_model, n_heads, d_ff, n_layers, dropout)

# Load pre-trained weights
model_path = '/content/drive/MyDrive/GEM_Project/GEM_1o_Aug_15.pt'
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# Set the model to evaluation mode
model.eval()

# Define a function to convert text to token IDs (example)
def text_to_ids(tokenizer, text):
    # Implement this function based on your tokenizer's method
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

# Define a tokenizer or token conversion function (example placeholder)
class DummyTokenizer:
    def tokenize(self, text):
        # Simple tokenization example, replace with actual tokenizer
        return text.split()

    def convert_tokens_to_ids(self, tokens):
        # Simple mapping example, replace with actual ID mapping
        return [ord(token[0]) % 50000 for token in tokens]

# Initialize tokenizer
tokenizer = DummyTokenizer()

# Test input
test_prompt = "This is a test."
test_input_ids = torch.tensor(text_to_ids(tokenizer, test_prompt), dtype=torch.long).unsqueeze(0)  # Add batch dimension
attention_mask = torch.ones(test_input_ids.shape, dtype=torch.bool)

# Perform a forward pass
with torch.no_grad():
    outputs = model(test_input_ids, attention_mask)
    print("Model outputs:")
    print(outputs)

# Test the generate method
generation_prompt = "Once upon a time"
input_ids = torch.tensor(text_to_ids(tokenizer, generation_prompt), dtype=torch.long).unsqueeze(0)  # Add batch dimension
generated_ids = model.generate(input_ids, max_length=10, temperature=1.0)
print("Generated IDs:")
print(generated_ids)
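One caveat for this test script: GEM.forward (models/gem_model.py below) passes the mask straight to torch.nn.TransformerEncoder as src_key_padding_mask, where True means "ignore this position". The all-True mask built above therefore masks every token. A minimal sketch of a mask in that convention, assuming a pad_token_id is available (hypothetical here, since DummyTokenizer does not define one):

# Sketch: padding mask in src_key_padding_mask convention (True = padding to ignore).
# pad_token_id is a hypothetical value; DummyTokenizer above has no pad token.
pad_token_id = 0
padding_mask = test_input_ids.eq(pad_token_id)      # True only where the input is padding
with torch.no_grad():
    outputs = model(test_input_ids, padding_mask)   # real tokens are no longer masked out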
configs/__pycache__/config.cpython-310.pyc
ADDED
Binary file (803 Bytes)
configs/config.py
ADDED
@@ -0,0 +1,28 @@
import torch

MODEL_CONFIG = {
    'VOCAB_SIZE': 50000,
    'D_MODEL': 1024,
    'N_HEADS': 32,
    'D_FF': 4096,
    'N_LAYERS': 32,
    'MAX_SEQ_LEN': 512,
    'BATCH_SIZE': 32,
    'LEARNING_RATE': 1e-4,
    'NUM_EPOCHS': 20,
    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu',
    'WARMUP_STEPS': 4000,
    'ADAM_EPSILON': 1e-8,
    'WEIGHT_DECAY': 0.01,
    'GRADIENT_ACCUMULATION_STEPS': 2,
    'MAX_GRAD_NORM': 1.0,
    'DROPOUT': 0.1,
}

TRAINING_CONFIG = {
    'CHECKPOINT_SAVE_STEPS': 5000,
    'LOGGING_STEPS': 100,
    'EVAL_STEPS': 1000,
    'SAVE_TOTAL_LIMIT': 5
}
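Note that the vocabulary size is specified inconsistently across the commit: MODEL_CONFIG uses 50000, Testings/testing.py hard-codes 50001, and train.py builds the model with len(tokenizer) after a [PAD] token may have been added. load_state_dict only succeeds when the embedding and output shapes match the checkpoint, so a quick inspection like the sketch below (assuming GEM_1o_Aug_15.pt is the raw state_dict that train.py saves) shows which size the checkpoint was trained with:

import torch

# Inspect the checkpoint's vocab dimension (sketch; key names follow models/gem_model.py).
state = torch.load('GEM_1o_Aug_15.pt', map_location='cpu')
print(state['embedding.weight'].shape)   # (vocab_size_at_training, d_model)
print(state['fc_out.weight'].shape)      # (vocab_size_at_training, d_model)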
generate.py
ADDED
@@ -0,0 +1,37 @@
import torch
from models.gem_model import GEM
from utils.data_preprocessing import load_tokenizer
from configs.config import MODEL_CONFIG

def generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7):
    device = torch.device(MODEL_CONFIG['DEVICE'])
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    generated = model.generate(input_ids, max_length=max_length, temperature=temperature)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def main():
    device = torch.device(MODEL_CONFIG['DEVICE'])

    tokenizer = load_tokenizer()

    model = GEM(
        vocab_size=MODEL_CONFIG['VOCAB_SIZE'],
        d_model=MODEL_CONFIG['D_MODEL'],
        n_heads=MODEL_CONFIG['N_HEADS'],
        d_ff=MODEL_CONFIG['D_FF'],
        n_layers=MODEL_CONFIG['N_LAYERS'],
        max_seq_len=MODEL_CONFIG['MAX_SEQ_LEN'],
        dropout=MODEL_CONFIG['DROPOUT']
    ).to(device)

    checkpoint = torch.load('final_model/model.pt')
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    prompt = "Once upon a time"
    generated_text = generate_text(model, tokenizer, prompt, max_length=100)
    print(f"Generated text:\n{generated_text}")

if __name__ == "__main__":
    main()
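Two details worth flagging against the rest of the commit: GEM.__init__ in models/gem_model.py (below) takes (vocab_size, d_model, n_heads, d_ff, n_layers, dropout) and has no max_seq_len parameter, so the call above would raise a TypeError as written; and train.py saves a raw state_dict to GEM_1o_Aug_15.pt rather than a {'model_state_dict': ...} dict at final_model/model.pt (a path not present in this commit). A construction and load matching what the commit actually provides might look like this sketch, reusing the names already imported in generate.py:

# Sketch: build GEM with the committed signature and load the raw state_dict from train.py.
model = GEM(
    vocab_size=MODEL_CONFIG['VOCAB_SIZE'],   # must match the checkpoint's vocab size
    d_model=MODEL_CONFIG['D_MODEL'],
    n_heads=MODEL_CONFIG['N_HEADS'],
    d_ff=MODEL_CONFIG['D_FF'],
    n_layers=MODEL_CONFIG['N_LAYERS'],
    dropout=MODEL_CONFIG['DROPOUT']
).to(device)
model.load_state_dict(torch.load('GEM_1o_Aug_15.pt', map_location=device))
model.eval()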
models/__init__.py
ADDED
File without changes
models/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (146 Bytes)
models/__pycache__/gem_model.cpython-310.pyc
ADDED
Binary file (2.59 kB)
models/gem_model.py
ADDED
@@ -0,0 +1,55 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class GEM(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, n_layers, dropout=0.1):
        super(GEM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids) * math.sqrt(self.d_model)
        x = self.positional_encoding(x)

        if attention_mask is not None:
            # Ensure attention_mask is in the shape (batch_size, sequence_length)
            attention_mask = attention_mask.bool()  # Ensure it's a boolean tensor
            x = self.transformer_encoder(x, src_key_padding_mask=attention_mask)
        else:
            x = self.transformer_encoder(x)

        x = self.fc_out(x)
        return x

    def generate(self, input_ids, max_length, temperature=1.0):
        self.eval()
        with torch.no_grad():
            for _ in range(max_length - input_ids.size(1)):
                outputs = self(input_ids)
                next_token_logits = outputs[:, -1, :] / temperature
                next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)
                input_ids = torch.cat([input_ids, next_token], dim=-1)
            return input_ids
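A layout note on PositionalEncoding: the encoder layers are built with batch_first=True, so x arrives as (batch, seq, d_model), but pe is stored as (max_len, 1, d_model) and sliced with x.size(0), i.e. by batch size rather than by sequence position. A batch-first variant, offered as an alternative sketch rather than the committed code, would look like this:

import math
import torch
import torch.nn as nn

class BatchFirstPositionalEncoding(nn.Module):
    # Sketch of a sinusoidal positional encoding for batch_first inputs
    # (an alternative, not the committed PositionalEncoding).
    def __init__(self, d_model, max_len=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)            # (1, seq, d_model) broadcasts over the batch
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):                                # x: (batch, seq, d_model)
        x = x + self.pe[:, :x.size(1), :]                # slice by sequence length, not batch size
        return self.dropout(x)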
requirements.txt
ADDED
@@ -0,0 +1,7 @@
torch
transformers
datasets
tensorboard
tokenizers
tqdm
wandb
tokenizer-merges.txt
ADDED
The diff for this file is too large to render.
tokenizer-vocab.json
ADDED
The diff for this file is too large to render.
tokenizer/gem_tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer/merges.txt
ADDED
The diff for this file is too large to render.
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
{
  "bos_token": "<|endoftext|>",
  "eos_token": "<|endoftext|>",
  "unk_token": "<|endoftext|>"
}
tokenizer/tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,19 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|endoftext|>",
  "clean_up_tokenization_spaces": true,
  "eos_token": "<|endoftext|>",
  "model_max_length": 1024,
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "<|endoftext|>"
}
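These tokenizer files are what utils/data_preprocessing.load_tokenizer reads back. A quick sanity check might look like this sketch:

from transformers import AutoTokenizer

# Load the committed tokenizer directory and confirm the special tokens configured above.
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")
print(tokenizer.eos_token)          # "<|endoftext|>"
print(tokenizer.model_max_length)   # 1024
print(len(tokenizer))               # vocabulary size, including any added tokens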
tokenizer/vocab.json
ADDED
The diff for this file is too large to render.
train.py
ADDED
@@ -0,0 +1,86 @@
import torch
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
import wandb
from transformers import get_linear_schedule_with_warmup
from utils.data_preprocessing import get_dataloader, load_tokenizer
from models.gem_model import GEM
from configs.config import MODEL_CONFIG, TRAINING_CONFIG

def train():
    wandb.init(project="GEM_Project", config=MODEL_CONFIG, mode="offline")
    print("WandB initialized in offline mode.")

    tokenizer = load_tokenizer()
    print("Tokenizer loaded.")

    dataloader = get_dataloader('wikitext', 'wikitext-2-raw-v1', tokenizer, MODEL_CONFIG['MAX_SEQ_LEN'], MODEL_CONFIG['BATCH_SIZE'])
    print("Dataloader created.")

    model = GEM(
        vocab_size=len(tokenizer),
        d_model=MODEL_CONFIG['D_MODEL'],
        n_heads=MODEL_CONFIG['N_HEADS'],
        d_ff=MODEL_CONFIG['D_FF'],
        n_layers=MODEL_CONFIG['N_LAYERS'],
        dropout=MODEL_CONFIG['DROPOUT']
    ).to(MODEL_CONFIG['DEVICE'])
    print("Model initialized.")

    optimizer = optim.AdamW(model.parameters(), lr=MODEL_CONFIG['LEARNING_RATE'], eps=MODEL_CONFIG['ADAM_EPSILON'])
    total_steps = len(dataloader) * MODEL_CONFIG['NUM_EPOCHS'] // MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=MODEL_CONFIG['WARMUP_STEPS'],
        num_training_steps=total_steps
    )
    print("Optimizer and scheduler set up.")

    # Mixed precision setup
    scaler = torch.cuda.amp.GradScaler()

    model.train()
    print("Starting training loop.")
    for epoch in range(MODEL_CONFIG['NUM_EPOCHS']):
        print(f"Epoch {epoch + 1}/{MODEL_CONFIG['NUM_EPOCHS']} started.")
        for step, batch in enumerate(tqdm(dataloader, desc=f"Epoch {epoch + 1}")):
            batch = batch.to(MODEL_CONFIG['DEVICE'])

            # Mixed precision training
            with torch.cuda.amp.autocast():
                outputs = model(batch)
                loss = F.cross_entropy(outputs.view(-1, outputs.size(-1)), batch.view(-1))

            # Gradient accumulation
            loss = loss / MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']
            scaler.scale(loss).backward()

            if (step + 1) % MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS'] == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), MODEL_CONFIG['MAX_GRAD_NORM'])
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

            if step % TRAINING_CONFIG['LOGGING_STEPS'] == 0:
                wandb.log({"loss": loss.item() * MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']})

            if step % TRAINING_CONFIG['EVAL_STEPS'] == 0:
                model.eval()
                with torch.no_grad():
                    val_loss = sum(F.cross_entropy(model(batch).view(-1, outputs.size(-1)), batch.view(-1)).item() for batch in dataloader)
                    wandb.log({"val_loss": val_loss / len(dataloader)})
                model.train()

            if step % TRAINING_CONFIG['CHECKPOINT_SAVE_STEPS'] == 0:
                torch.save(model.state_dict(), f"checkpoint_{epoch}_{step}.pt")

    torch.save(model.state_dict(), "GEM_1o_Aug_15.pt")
    print("Training complete. Final model saved.")

if __name__ == "__main__":
    train()
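Two training-objective details worth noting: the loss above compares the prediction at position t with the input token at that same position t, and the encoder applies no causal mask, so every position can attend to later tokens (the committed GEM.forward does not accept an attention mask for this purpose). A conventional next-token setup, as a sketch reusing the names from the loop above rather than the committed loss, shifts the labels and ignores padding:

# Sketch: next-token loss with padding ignored (assumes tokenizer.pad_token_id is set,
# which load_tokenizer ensures by adding a [PAD] token when missing).
logits = outputs[:, :-1, :]                # predictions for positions 0 .. T-2
labels = batch[:, 1:]                      # targets are the following tokens
loss = F.cross_entropy(
    logits.reshape(-1, logits.size(-1)),
    labels.reshape(-1),
    ignore_index=tokenizer.pad_token_id,
)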
utils/__init__.py
ADDED
File without changes
utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (145 Bytes)
utils/__pycache__/data_preprocessing.cpython-310.pyc
ADDED
Binary file (1.93 kB)
utils/__pycache__/tokenizer.cpython-310.pyc
ADDED
Binary file (1.66 kB)
utils/data_preprocessing.py
ADDED
@@ -0,0 +1,40 @@
import torch
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import AutoTokenizer

def train_tokenizer(texts, vocab_size=50000, min_frequency=2):
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer = tokenizer.train_new_from_iterator(texts, vocab_size=vocab_size, min_frequency=min_frequency)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.save_pretrained("./tokenizer")
    return tokenizer

def load_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("./tokenizer")
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    return tokenizer

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encodings = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length)
        return torch.tensor(encodings['input_ids'])

def get_dataloader(dataset_name, config_name, tokenizer, max_length, batch_size):
    dataset = load_dataset(dataset_name, config_name)
    texts = dataset['train']['text'][:50]  # delete this slice for actual training with the full vocab size
    dataset = TextDataset(texts, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader
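A quick usage sketch for the pieces above (the [:50] slice keeps only 50 wikitext lines, so this is a smoke test rather than real training data):

from utils.data_preprocessing import load_tokenizer, get_dataloader

tokenizer = load_tokenizer()          # expects the committed ./tokenizer directory
loader = get_dataloader('wikitext', 'wikitext-2-raw-v1', tokenizer,
                        max_length=512, batch_size=32)
batch = next(iter(loader))            # LongTensor of shape (batch_size, max_length)
print(batch.shape)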
utils/text_generation.py
ADDED
@@ -0,0 +1,18 @@
import torch

def generate_text(model, tokenizer, prompt, max_length=50, device='cuda'):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            next_token_logits = outputs[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)
            input_ids = torch.cat([input_ids, next_token], dim=-1)

            if next_token.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)
utils/tokenizer.py
ADDED
@@ -0,0 +1,23 @@
# utils/tokenizer.py

class CharTokenizer:
    def __init__(self):
        self.chars = set()
        self.char2idx = {}
        self.idx2char = {}

    def fit(self, texts):
        for text in texts:
            self.chars.update(set(text))
        self.chars = sorted(list(self.chars))
        self.char2idx = {char: idx for idx, char in enumerate(self.chars)}
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}

    def encode(self, text, max_length=None):
        encoded = [self.char2idx[char] for char in text if char in self.char2idx]
        if max_length:
            encoded = encoded[:max_length] + [0] * (max_length - len(encoded))
        return encoded

    def decode(self, tokens):
        return ''.join([self.idx2char[token] for token in tokens if token in self.idx2char])
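CharTokenizer is not referenced by the other scripts in this commit (they use the Hugging Face tokenizer in ./tokenizer). A round-trip sketch:

from utils.tokenizer import CharTokenizer

tok = CharTokenizer()
tok.fit(["hello world"])
ids = tok.encode("hello", max_length=8)   # right-padded with 0s up to max_length
print(ids)
print(tok.decode(ids))                    # note: id 0 also maps to a real character, so padding decodes as that character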