zaibutcooler committed on
Commit 49ffc60 · 1 Parent(s): 951a31c

data prep bin success

Files changed (4)
  1. .gitignore +2 -0
  2. training.py +2 -0
  3. yume/dataset.py +17 -11
  4. yume/utils.py +3 -1
.gitignore CHANGED
@@ -32,3 +32,5 @@ models/
 # Ignore system files
 .DS_Store
 Thumbs.db
+
+dummy.py
training.py CHANGED
@@ -8,6 +8,8 @@ dataset.build_dataset()
 
 yume = Yume(config)
 
+
+
 # assert len(dataset.data) > 0
 
 # yume.pretrain(dataset)
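The commented-out assertion hints at a sanity check on the prepared data. One hedged way to spot-check the freshly written binary (the file name "train.bin" comes from yume/dataset.py; the token dtype is an assumption this commit does not pin down) is to map it back with NumPy:

    # Hypothetical sanity check, not part of the repo: read the prepared bin back
    # and confirm it is non-empty. Adjust dtype to match however the tokens were written.
    import numpy as np

    def bin_token_count(path, dtype=np.uint16):
        tokens = np.memmap(path, dtype=dtype, mode="r")
        return len(tokens)

    assert bin_token_count("train.bin") > 0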
yume/dataset.py CHANGED
@@ -8,7 +8,7 @@ from .utils import dummy_logger
 import tiktoken
 
 class Trainset(Dataset):
-    def __init__(self, batch_size=48, dataset_url="zaibutcooler/wiki-japanese"):
+    def __init__(self, batch_size=48, dataset_url="zaibutcooler/japanwiki-vault"):
         self.batch_size = batch_size
         self.dataset_url = dataset_url
         self.tokenizer = None
@@ -17,23 +17,24 @@ class Trainset(Dataset):
 
     def _load_dataset(self):
         loaded_dataset = load_dataset(self.dataset_url)
-        self.texts = loaded_dataset["animanga"]["texts"]
+        self.text = loaded_dataset["train"]["text"]
         dummy_logger("Successfully loaded the dataset")
 
-    def _tokenize(self, tiktoken=True):
-        if tiktoken:
-            enc = tiktoken.get_encoding("cl100k_base")
+    def _tokenize(self, use_tiktoken=True):
+        if use_tiktoken:
+            enc = tiktoken.encoding_for_model("gpt-4")
             self.tokenizer = enc
         else:
             self.tokenizer = Tokenizer()
             self.tokenizer.load_pretrained()
-        self.texts = self.texts.map(lambda x: self.tokenizer.encode(x))
+        self.text = torch.utils.data.Dataset(self.text).map(lambda x: self.tokenizer.encode(x))
+
 
     def _prep_bin(self):
         # Split the dataset into training and validation sets
-        train_size = int(0.99 * len(self.texts))
-        val_size = len(self.texts) - train_size
-        self.train_data, self.val_data = torch.utils.data.random_split(self.texts, [train_size, val_size])
+        train_size = int(0.99 * len(self.text))
+        val_size = len(self.text) - train_size
+        self.train_data, self.val_data = torch.utils.data.random_split(self.text, [train_size, val_size])
 
         # Save the tokenized data to binary files
         self._save_to_bin(self.train_data, "train.bin")
@@ -41,12 +42,12 @@ class Trainset(Dataset):
 
     def _save_to_bin(self, data, filename):
         arr_len = np.sum([len(x) for x in data], dtype=np.uint64)
-        dtype = np.uint16
+        dtype = np.object_  # Change dtype to np.object_
        arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
 
         idx = 0
         for x in data:
-            arr[idx:idx + len(x)] = x
+            arr[idx:idx + len(x)] = [x]  # Wrap x in a list to convert to object array
             idx += len(x)
         arr.flush()
 
@@ -61,6 +62,11 @@ class Trainset(Dataset):
         return self.train_data[:batch_size]
 
     def build_dataset(self):
+
         self._load_dataset()
+
+
         self._tokenize()
+        dummy_logger("Preparing the Bin")
+
         self._prep_bin()
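For context on the _save_to_bin change: np.memmap requires a fixed-size dtype and cannot back an object array with a raw memory buffer, so the more common pattern for this kind of data prep is to flatten the token IDs into one integer stream. A minimal sketch of that pattern, assuming cl100k_base token IDs (which need more than 16 bits) and a hypothetical helper name not present in this repo:

    # Sketch only: flatten tokenized texts into one integer .bin via np.memmap.
    # save_tokens_to_bin is a hypothetical name, not a function in this repo.
    import numpy as np
    import tiktoken

    def save_tokens_to_bin(texts, filename, dtype=np.uint32):
        enc = tiktoken.get_encoding("cl100k_base")   # same encoding gpt-4 uses
        token_lists = [enc.encode(t) for t in texts]
        arr_len = sum(len(t) for t in token_lists)
        # memmap needs a fixed-size dtype; uint32 comfortably holds cl100k_base IDs
        arr = np.memmap(filename, dtype=dtype, mode="w+", shape=(arr_len,))
        idx = 0
        for toks in token_lists:
            arr[idx:idx + len(toks)] = toks
            idx += len(toks)
        arr.flush()
        return arr_len

With a flat layout like this, train.bin can later be memory-mapped back at training time and sliced into fixed-length blocks without loading the whole corpus into RAM.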
yume/utils.py CHANGED
@@ -2,7 +2,9 @@ from .tokenizer import Tokenizer
 
 
 def dummy_logger(text):
-    pass
+    print("###################################")
+    print(f"{text}")
+    print("###################################")
 
 
 def training_logger(text):
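For reference, the updated dummy_logger now frames its message in a banner, so a call like the one in yume/dataset.py prints:

    from yume.utils import dummy_logger

    dummy_logger("Successfully loaded the dataset")
    # ###################################
    # Successfully loaded the dataset
    # ###################################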