Zai committed on
Commit
50310f8
1 Parent(s): 36cbecb

to test dataset loading

Browse files
.github/workflows/hugging-face.yaml CHANGED
@@ -12,8 +12,33 @@ jobs:
12
  with:
13
  fetch-depth: 0
14
  lfs: true
15
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  - name: Push to hub
17
  env:
18
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
19
- run: git push https://zaibutcooler:$HF_TOKEN@huggingface.co/spaces/zaibutcooler/yume main
 
 
12
  with:
13
  fetch-depth: 0
14
  lfs: true
15
+ - name: Set Git identity
16
+ run: |
17
+ git config --global user.email "github-actions-bot@github.com"
18
+ git config --global user.name "GitHub Actions"
19
+
20
+ - name: Update README.md
21
+ run: |
22
+ tmp_file=$(mktemp)
23
+ echo "---" >> $tmp_file
24
+ echo "title: Yume" >> $tmp_file
25
+ echo "emoji: ✨" >> $tmp_file
26
+ echo "colorFrom: green" >> $tmp_file
27
+ echo "colorTo: blue" >> $tmp_file
28
+ echo "sdk: streamlit" >> $tmp_file
29
+ echo "sdk_version: 1.29.0" >> $tmp_file
30
+ echo "app_file: interface.py" >> $tmp_file
31
+ echo "pinned: false" >> $tmp_file
32
+ echo "license: openrail" >> $tmp_file
33
+ echo "---" >> $tmp_file
34
+ echo "" >> $tmp_file
35
+ cat README.md >> $tmp_file
36
+ mv $tmp_file README.md
37
+ git add README.md
38
+ git commit -m "Updated README.md"
39
+
40
  - name: Push to hub
41
  env:
42
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
43
+ run: |
44
+ git push https://zaibutcooler:$HF_TOKEN@huggingface.co/spaces/zaibutcooler/yume --force main
sampling.py CHANGED
@@ -1,4 +1,4 @@
1
- from .yume import Yume,Config
2
 
3
  config = Config()
4
 
 
1
+ from .yume import Yume, Config
2
 
3
  config = Config()
4
 
tests/test_datasets.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from yume.dataset import Trainset
3
 
4
 
5
  class TestDatasets(unittest.TestCase):
@@ -19,5 +19,6 @@ class TestDatasets(unittest.TestCase):
19
  encoded_text = trainset.tokenizer.encode(dummy_text)
20
  assert trainset.tokenizer.decode(encoded_text) == dummy_text
21
 
 
22
  if __name__ == "__main__":
23
  unittest.main()
 
1
  import unittest
2
+ from yume.dataset import Trainset
3
 
4
 
5
  class TestDatasets(unittest.TestCase):
 
19
  encoded_text = trainset.tokenizer.encode(dummy_text)
20
  assert trainset.tokenizer.decode(encoded_text) == dummy_text
21
 
22
+
23
  if __name__ == "__main__":
24
  unittest.main()
tests/test_pretrained.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from yume import Yume,Config
3
 
4
 
5
  class TestPretrained(unittest.TestCase):
@@ -7,7 +7,7 @@ class TestPretrained(unittest.TestCase):
7
  super().__init__(methodName)
8
  self.config = Config()
9
  self.yume = Yume(config=self.config)
10
-
11
  def test_download(self):
12
  self.yume.load_pretrained()
13
  pass
 
1
  import unittest
2
+ from yume import Yume, Config
3
 
4
 
5
  class TestPretrained(unittest.TestCase):
 
7
  super().__init__(methodName)
8
  self.config = Config()
9
  self.yume = Yume(config=self.config)
10
+
11
  def test_download(self):
12
  self.yume.load_pretrained()
13
  pass
tests/test_tokenizer.py CHANGED
@@ -1,12 +1,13 @@
1
  import unittest
2
  from yume import Tokenizer
3
 
 
4
  class TestTokenizer(unittest.TestCase):
5
  def __init__(self, methodName: str = "runTest") -> None:
6
  super().__init__(methodName)
7
  self.tokenizer = Tokenizer()
8
  self.dummy_text = "馬鹿なこと言わないでよ"
9
-
10
  def test_encode(self):
11
  pass
12
 
 
1
  import unittest
2
  from yume import Tokenizer
3
 
4
+
5
  class TestTokenizer(unittest.TestCase):
6
  def __init__(self, methodName: str = "runTest") -> None:
7
  super().__init__(methodName)
8
  self.tokenizer = Tokenizer()
9
  self.dummy_text = "馬鹿なこと言わないでよ"
10
+
11
  def test_encode(self):
12
  pass
13
 
training.py CHANGED
@@ -1,4 +1,4 @@
1
- from .yume import Yume,Trainset,Config
2
 
3
  config = Config()
4
 
@@ -16,6 +16,6 @@ yume.pretrain(dataset.data)
16
 
17
  yume.sample()
18
 
19
- #optional
20
  # yume.huggingface_login("your hf tokens")
21
- # yume.save_pretrained("yume")
 
1
+ from .yume import Yume, Trainset, Config
2
 
3
  config = Config()
4
 
 
16
 
17
  yume.sample()
18
 
19
+ # optional
20
  # yume.huggingface_login("your hf tokens")
21
+ # yume.save_pretrained("yume")
yume/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
  from .yume import Yume
2
  from .dataset import Trainset
3
- from.tokenizer import Tokenizer
4
  from .config import Config
 
1
  from .yume import Yume
2
  from .dataset import Trainset
3
+ from .tokenizer import Tokenizer
4
  from .config import Config
yume/config.py CHANGED
@@ -9,7 +9,7 @@ class Config:
9
  n_embd=768,
10
  dropout=0.0,
11
  bias=True,
12
- lr=0.001
13
  ) -> None:
14
  self.num_epoch = num_epoch
15
  self.block_sized = 1024
 
9
  n_embd=768,
10
  dropout=0.0,
11
  bias=True,
12
+ lr=0.001,
13
  ) -> None:
14
  self.num_epoch = num_epoch
15
  self.block_sized = 1024
yume/dataset.py CHANGED
@@ -14,19 +14,17 @@ class Trainset(Dataset):
14
 
15
  def __len__(self):
16
  return len(self.data)
17
-
18
  def __getitem__(self, index):
19
  assert len(self.data) > 10
20
  return []
21
 
22
-
23
- def _load_dataset(self,url="zaibutcooler/animanga-vault"):
24
  loaded_dataset = load_dataset(url)
25
- self.texts = self.loaded_data["train"]["raw"]
26
- self.data = self.loaded_data["train"]["data"]
27
  dummy_logger("Successfully loaded the dataset")
28
-
29
- def _tokenize(self,tiktoken=True):
30
  if tiktoken:
31
  enc = tiktoken.get_encoding("cl100k_base")
32
  assert enc.decode(enc.encode("hello world")) == "hello world"
@@ -36,4 +34,4 @@ class Trainset(Dataset):
36
  else:
37
  self.tokenizer = Tokenizer()
38
  self.tokenizer.load_pretrained()
39
-
 
14
 
15
  def __len__(self):
16
  return len(self.data)
17
+
18
  def __getitem__(self, index):
19
  assert len(self.data) > 10
20
  return []
21
 
22
+ def _load_dataset(self, url="zaibutcooler/animanga-vault"):
 
23
  loaded_dataset = load_dataset(url)
24
+ self.texts = loaded_dataset["animanga"]["texts"]
 
25
  dummy_logger("Successfully loaded the dataset")
26
+
27
+ def _tokenize(self, tiktoken=True):
28
  if tiktoken:
29
  enc = tiktoken.get_encoding("cl100k_base")
30
  assert enc.decode(enc.encode("hello world")) == "hello world"
 
34
  else:
35
  self.tokenizer = Tokenizer()
36
  self.tokenizer.load_pretrained()
37
+ self.tokenizer.encode(self.texts)
yume/yume.py CHANGED
@@ -18,17 +18,16 @@ class Yume:
18
 
19
  def generate(self):
20
  pass
21
-
22
  def sample(self):
23
  pass
24
 
25
- def pretrain(self,tokens):
26
  lr = self.config.lr
27
  num_epochs = self.config.num_epoch
28
-
29
-
30
  pass
31
-
32
  def fine_tune(self):
33
  pass
34
 
@@ -38,7 +37,7 @@ class Yume:
38
  n_params -= self.transformer.wpe.weight.numel()
39
  dummy_logger(f"parameter count -> {n_params}")
40
  return n_params
41
-
42
  def save_pretrained(self, name="yume"):
43
  self.model.save_pretrained(name)
44
  self.model.push_to_hub(name)
@@ -51,4 +50,4 @@ class Yume:
51
  def huggingface_login(self, token):
52
  assert token is not None
53
  login(token=token)
54
- dummy_logger("Logged in successfully")
 
18
 
19
  def generate(self):
20
  pass
21
+
22
  def sample(self):
23
  pass
24
 
25
+ def pretrain(self, tokens):
26
  lr = self.config.lr
27
  num_epochs = self.config.num_epoch
28
+
 
29
  pass
30
+
31
  def fine_tune(self):
32
  pass
33
 
 
37
  n_params -= self.transformer.wpe.weight.numel()
38
  dummy_logger(f"parameter count -> {n_params}")
39
  return n_params
40
+
41
  def save_pretrained(self, name="yume"):
42
  self.model.save_pretrained(name)
43
  self.model.push_to_hub(name)
 
50
  def huggingface_login(self, token):
51
  assert token is not None
52
  login(token=token)
53
+ dummy_logger("Logged in successfully")