Commit · 49ffc60
Parent(s): 951a31c
data prep bin success
Files changed:
- .gitignore (+2 -0)
- training.py (+2 -0)
- yume/dataset.py (+17 -11)
- yume/utils.py (+3 -1)
.gitignore
CHANGED
@@ -32,3 +32,5 @@ models/
 # Ignore system files
 .DS_Store
 Thumbs.db
+
+dummy.py
training.py
CHANGED
@@ -8,6 +8,8 @@ dataset.build_dataset()
 
 yume = Yume(config)
 
+
+
 # assert len(dataset.data) > 0
 
 # yume.pretrain(dataset)
yume/dataset.py
CHANGED
@@ -8,7 +8,7 @@ from .utils import dummy_logger
 import tiktoken
 
 class Trainset(Dataset):
-    def __init__(self, batch_size=48, dataset_url="zaibutcooler/…
+    def __init__(self, batch_size=48, dataset_url="zaibutcooler/japanwiki-vault"):
         self.batch_size = batch_size
         self.dataset_url = dataset_url
         self.tokenizer = None
@@ -17,23 +17,24 @@ class Trainset(Dataset):
 
     def _load_dataset(self):
         loaded_dataset = load_dataset(self.dataset_url)
-        self.…
+        self.text = loaded_dataset["train"]["text"]
         dummy_logger("Successfully loaded the dataset")
 
-    def _tokenize(self, …
-        if …
-            enc = tiktoken.…
+    def _tokenize(self, use_tiktoken=True):
+        if use_tiktoken:
+            enc = tiktoken.encoding_for_model("gpt-4")
             self.tokenizer = enc
         else:
             self.tokenizer = Tokenizer()
             self.tokenizer.load_pretrained()
-…
+        self.text = torch.utils.data.Dataset(self.text).map(lambda x: self.tokenizer.encode(x))
+
 
     def _prep_bin(self):
         # Split the dataset into training and validation sets
-        train_size = int(0.99 * len(self.…
-        val_size = len(self.…
-        self.train_data, self.val_data = torch.utils.data.random_split(self.…
+        train_size = int(0.99 * len(self.text))
+        val_size = len(self.text) - train_size
+        self.train_data, self.val_data = torch.utils.data.random_split(self.text, [train_size, val_size])
 
         # Save the tokenized data to binary files
         self._save_to_bin(self.train_data, "train.bin")
@@ -41,12 +42,12 @@ class Trainset(Dataset):
 
     def _save_to_bin(self, data, filename):
         arr_len = np.sum([len(x) for x in data], dtype=np.uint64)
-        dtype = np.…
+        dtype = np.object_  # Change dtype to np.object_
         arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
 
         idx = 0
         for x in data:
-            arr[idx:idx + len(x)] = x
+            arr[idx:idx + len(x)] = [x]  # Wrap x in a list to convert to object array
             idx += len(x)
         arr.flush()
@@ -61,6 +62,11 @@ class Trainset(Dataset):
         return self.train_data[:batch_size]
 
     def build_dataset(self):
+
         self._load_dataset()
+
+
         self._tokenize()
+        dummy_logger("Preparing the Bin")
+
         self._prep_bin()
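For reference, here is a minimal, self-contained sketch of the tokenize-then-write-to-bin step this commit is working toward. It is not the repository's code: the helper name encode_to_bin is illustrative, and it assumes the cl100k_base BPE that tiktoken uses for GPT-4. Those token ids exceed 65535, and np.memmap only accepts fixed-size dtypes (object arrays cannot be memory-mapped), so the sketch writes the ids as np.uint32.

# Sketch only (hypothetical helper, not part of the repo): tokenize a list of
# strings with tiktoken and stream the token ids into a flat .bin via np.memmap.
import numpy as np
import tiktoken

def encode_to_bin(texts, filename):
    enc = tiktoken.get_encoding("cl100k_base")   # same BPE tiktoken maps to GPT-4
    ids = [enc.encode(t) for t in texts]         # one list of token ids per document
    arr_len = int(sum(len(x) for x in ids))      # total token count across documents
    # cl100k_base ids go above 65535, so uint16 is too small; uint32 is safe
    arr = np.memmap(filename, dtype=np.uint32, mode="w+", shape=(arr_len,))
    idx = 0
    for x in ids:
        arr[idx:idx + len(x)] = x                # write this document's ids contiguously
        idx += len(x)
    arr.flush()
    return arr_len

# Example: encode_to_bin(["こんにちは世界", "a short English line"], "train.bin")

The flat-array layout means the .bin can later be re-opened with np.memmap in read mode and sliced into training batches without loading the whole corpus into memory.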
yume/utils.py
CHANGED
@@ -2,7 +2,9 @@ from .tokenizer import Tokenizer
 
 
 def dummy_logger(text):
-    …
+    print("###################################")
+    print(f"{text}")
+    print("###################################")
 
 
 def training_logger(text):
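A quick usage sketch of the new dummy_logger, assuming the yume package is importable; the expected output (per the three print calls above) is shown as comments.

from yume.utils import dummy_logger

dummy_logger("Successfully loaded the dataset")
# ###################################
# Successfully loaded the dataset
# ###################################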