samwell committed on
Commit
efeb0bd
1 Parent(s): a4372e8

Create supplementary.py

Files changed (1)
  1. supplementary.py +52 -0
supplementary.py ADDED
@@ -0,0 +1,52 @@
+ # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
+ # Source for "Build a Large Language Model From Scratch"
+ #   - https://www.manning.com/books/build-a-large-language-model-from-scratch
+ # Code: https://github.com/rasbt/LLMs-from-scratch
+
+ import torch
+ import tiktoken
+ from torch.utils.data import Dataset, DataLoader
+
+
+ class GPTDatasetV1(Dataset):
+     def __init__(self, txt, tokenizer, max_length, stride):
+         self.input_ids = []
+         self.target_ids = []
+
+         # Tokenize the entire text
+         token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
+
+         # Use a sliding window to chunk the book into overlapping sequences of max_length
+         for i in range(0, len(token_ids) - max_length, stride):
+             input_chunk = token_ids[i:i + max_length]
+             target_chunk = token_ids[i + 1: i + max_length + 1]
+             self.input_ids.append(torch.tensor(input_chunk))
+             self.target_ids.append(torch.tensor(target_chunk))
+
+     def __len__(self):
+         return len(self.input_ids)
+
+     def __getitem__(self, idx):
+         return self.input_ids[idx], self.target_ids[idx]
+
+
+ def create_dataloader_v1(txt, batch_size=4, max_length=256,
+                          stride=128, shuffle=True, drop_last=True,
+                          num_workers=0):
+
+     # Initialize the tokenizer
+     tokenizer = tiktoken.get_encoding("gpt2")
+
+     # Create dataset
+     dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
+
+     # Create dataloader
+     dataloader = DataLoader(
+         dataset,
+         batch_size=batch_size,
+         shuffle=shuffle,
+         drop_last=drop_last,
+         num_workers=num_workers
+     )
+
+     return dataloader
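
For reference, a minimal usage sketch of the file added above (not part of the commit): it assumes supplementary.py is importable and that a plain-text training file named the-verdict.txt (the short story used in the book's examples) exists locally; the batch size, context length, and stride values are illustrative only.

    # Usage sketch: the file name and hyperparameters below are assumptions,
    # not part of the commit above.
    from supplementary import create_dataloader_v1

    with open("the-verdict.txt", "r", encoding="utf-8") as f:
        raw_text = f.read()

    dataloader = create_dataloader_v1(
        raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
    )

    inputs, targets = next(iter(dataloader))
    print(inputs.shape)   # torch.Size([8, 4]): one row of token IDs per sequence
    print(targets.shape)  # same shape; targets are the inputs shifted by one token

Note that with stride equal to max_length, as here, consecutive windows do not overlap; a smaller stride (as with the defaults stride=128 and max_length=256) produces overlapping windows, trading some repetition between samples for more training examples per text.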