samwell committed on
Commit
efeb0bd
1 Parent(s): a4372e8

Create supplementary.py

Files changed (1)
  1. supplementary.py +52 -0
supplementary.py ADDED
@@ -0,0 +1,52 @@
+ # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
+ # Source for "Build a Large Language Model From Scratch"
+ #   - https://www.manning.com/books/build-a-large-language-model-from-scratch
+ # Code: https://github.com/rasbt/LLMs-from-scratch
+
+ import torch
+ import tiktoken
+ from torch.utils.data import Dataset, DataLoader
+
+
+ class GPTDatasetV1(Dataset):
+     def __init__(self, txt, tokenizer, max_length, stride):
+         self.input_ids = []
+         self.target_ids = []
+
+         # Tokenize the entire text
+         token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
+
+         # Use a sliding window to chunk the book into overlapping sequences of max_length
+         for i in range(0, len(token_ids) - max_length, stride):
+             input_chunk = token_ids[i:i + max_length]
+             target_chunk = token_ids[i + 1: i + max_length + 1]
+             self.input_ids.append(torch.tensor(input_chunk))
+             self.target_ids.append(torch.tensor(target_chunk))
+
+     def __len__(self):
+         return len(self.input_ids)
+
+     def __getitem__(self, idx):
+         return self.input_ids[idx], self.target_ids[idx]
+
+
+ def create_dataloader_v1(txt, batch_size=4, max_length=256,
+                          stride=128, shuffle=True, drop_last=True,
+                          num_workers=0):
+
+     # Initialize the tokenizer
+     tokenizer = tiktoken.get_encoding("gpt2")
+
+     # Create dataset
+     dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
+
+     # Create dataloader
+     dataloader = DataLoader(
+         dataset,
+         batch_size=batch_size,
+         shuffle=shuffle,
+         drop_last=drop_last,
+         num_workers=num_workers
+     )
+
+     return dataloader
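
For reference, a minimal usage sketch of the file added above (not part of the commit): it assumes supplementary.py is importable and that a plain-text training file named the-verdict.txt (the short story used in the book's examples) exists locally; the batch size, context length, and stride values are illustrative only.

    # Usage sketch: the file name and hyperparameters below are assumptions,
    # not part of the commit above.
    from supplementary import create_dataloader_v1

    with open("the-verdict.txt", "r", encoding="utf-8") as f:
        raw_text = f.read()

    dataloader = create_dataloader_v1(
        raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
    )

    inputs, targets = next(iter(dataloader))
    print(inputs.shape)   # torch.Size([8, 4]): one row of token IDs per sequence
    print(targets.shape)  # same shape; targets are the inputs shifted by one token

Note that with stride equal to max_length, as here, consecutive windows do not overlap; a smaller stride (as with the defaults stride=128 and max_length=256) produces overlapping windows, trading some repetition between samples for more training examples per text.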