Spaces:
Sleeping
Sleeping
Zai
committed on
Commit
•
50310f8
1
Parent(s):
36cbecb
to test dataset loading
Browse files
- .github/workflows/hugging-face.yaml +27 -2
- sampling.py +1 -1
- tests/test_datasets.py +2 -1
- tests/test_pretrained.py +2 -2
- tests/test_tokenizer.py +2 -1
- training.py +3 -3
- yume/__init__.py +1 -1
- yume/config.py +1 -1
- yume/dataset.py +6 -8
- yume/yume.py +6 -7
.github/workflows/hugging-face.yaml
CHANGED
@@ -12,8 +12,33 @@ jobs:
|
|
12 |
with:
|
13 |
fetch-depth: 0
|
14 |
lfs: true
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
- name: Push to hub
|
17 |
env:
|
18 |
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
19 |
-
run:
|
|
|
|
12 |
with:
|
13 |
fetch-depth: 0
|
14 |
lfs: true
|
15 |
+
- name: Set Git identity
|
16 |
+
run: |
|
17 |
+
git config --global user.email "github-actions-bot@github.com"
|
18 |
+
git config --global user.name "GitHub Actions"
|
19 |
+
|
20 |
+
- name: Update README.md
|
21 |
+
run: |
|
22 |
+
tmp_file=$(mktemp)
|
23 |
+
echo "---" >> $tmp_file
|
24 |
+
echo "title: Yume" >> $tmp_file
|
25 |
+
echo "emoji: ✨" >> $tmp_file
|
26 |
+
echo "colorFrom: green" >> $tmp_file
|
27 |
+
echo "colorTo: blue" >> $tmp_file
|
28 |
+
echo "sdk: streamlit" >> $tmp_file
|
29 |
+
echo "sdk_version: 1.29.0" >> $tmp_file
|
30 |
+
echo "app_file: interface.py" >> $tmp_file
|
31 |
+
echo "pinned: false" >> $tmp_file
|
32 |
+
echo "license: openrail" >> $tmp_file
|
33 |
+
echo "---" >> $tmp_file
|
34 |
+
echo "" >> $tmp_file
|
35 |
+
cat README.md >> $tmp_file
|
36 |
+
mv $tmp_file README.md
|
37 |
+
git add README.md
|
38 |
+
git commit -m "Updated README.md"
|
39 |
+
|
40 |
- name: Push to hub
|
41 |
env:
|
42 |
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
43 |
+
run: |
|
44 |
+
git push https://zaibutcooler:$HF_TOKEN@huggingface.co/spaces/zaibutcooler/yume --force main
|
sampling.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from .yume import Yume,Config
|
2 |
|
3 |
config = Config()
|
4 |
|
|
|
1 |
+
from .yume import Yume, Config
|
2 |
|
3 |
config = Config()
|
4 |
|
tests/test_datasets.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import unittest
|
2 |
-
from yume.dataset import Trainset
|
3 |
|
4 |
|
5 |
class TestDatasets(unittest.TestCase):
|
@@ -19,5 +19,6 @@ class TestDatasets(unittest.TestCase):
|
|
19 |
encoded_text = trainset.tokenizer.encode(dummy_text)
|
20 |
assert trainset.tokenizer.decode(encoded_text) == dummy_text
|
21 |
|
|
|
22 |
if __name__ == "__main__":
|
23 |
unittest.main()
|
|
|
1 |
import unittest
|
2 |
+
from yume.dataset import Trainset
|
3 |
|
4 |
|
5 |
class TestDatasets(unittest.TestCase):
|
|
|
19 |
encoded_text = trainset.tokenizer.encode(dummy_text)
|
20 |
assert trainset.tokenizer.decode(encoded_text) == dummy_text
|
21 |
|
22 |
+
|
23 |
if __name__ == "__main__":
|
24 |
unittest.main()
|
tests/test_pretrained.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import unittest
|
2 |
-
from yume import Yume,Config
|
3 |
|
4 |
|
5 |
class TestPretrained(unittest.TestCase):
|
@@ -7,7 +7,7 @@ class TestPretrained(unittest.TestCase):
|
|
7 |
super().__init__(methodName)
|
8 |
self.config = Config()
|
9 |
self.yume = Yume(config=self.config)
|
10 |
-
|
11 |
def test_download(self):
|
12 |
self.yume.load_pretrained()
|
13 |
pass
|
|
|
1 |
import unittest
|
2 |
+
from yume import Yume, Config
|
3 |
|
4 |
|
5 |
class TestPretrained(unittest.TestCase):
|
|
|
7 |
super().__init__(methodName)
|
8 |
self.config = Config()
|
9 |
self.yume = Yume(config=self.config)
|
10 |
+
|
11 |
def test_download(self):
|
12 |
self.yume.load_pretrained()
|
13 |
pass
|
tests/test_tokenizer.py
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
import unittest
|
2 |
from yume import Tokenizer
|
3 |
|
|
|
4 |
class TestTokenizer(unittest.TestCase):
|
5 |
def __init__(self, methodName: str = "runTest") -> None:
|
6 |
super().__init__(methodName)
|
7 |
self.tokenizer = Tokenizer()
|
8 |
self.dummy_text = "馬鹿なこと言わないでよ"
|
9 |
-
|
10 |
def test_encode(self):
|
11 |
pass
|
12 |
|
|
|
1 |
import unittest
|
2 |
from yume import Tokenizer
|
3 |
|
4 |
+
|
5 |
class TestTokenizer(unittest.TestCase):
|
6 |
def __init__(self, methodName: str = "runTest") -> None:
|
7 |
super().__init__(methodName)
|
8 |
self.tokenizer = Tokenizer()
|
9 |
self.dummy_text = "馬鹿なこと言わないでよ"
|
10 |
+
|
11 |
def test_encode(self):
|
12 |
pass
|
13 |
|
training.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from .yume import Yume,Trainset,Config
|
2 |
|
3 |
config = Config()
|
4 |
|
@@ -16,6 +16,6 @@ yume.pretrain(dataset.data)
|
|
16 |
|
17 |
yume.sample()
|
18 |
|
19 |
-
#optional
|
20 |
# yume.huggingface_login("your hf tokens")
|
21 |
-
# yume.save_pretrained("yume")
|
|
|
1 |
+
from .yume import Yume, Trainset, Config
|
2 |
|
3 |
config = Config()
|
4 |
|
|
|
16 |
|
17 |
yume.sample()
|
18 |
|
19 |
+
# optional
|
20 |
# yume.huggingface_login("your hf tokens")
|
21 |
+
# yume.save_pretrained("yume")
|
yume/__init__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
from .yume import Yume
|
2 |
from .dataset import Trainset
|
3 |
-
from.tokenizer import Tokenizer
|
4 |
from .config import Config
|
|
|
1 |
from .yume import Yume
|
2 |
from .dataset import Trainset
|
3 |
+
from .tokenizer import Tokenizer
|
4 |
from .config import Config
|
yume/config.py
CHANGED
@@ -9,7 +9,7 @@ class Config:
|
|
9 |
n_embd=768,
|
10 |
dropout=0.0,
|
11 |
bias=True,
|
12 |
-
lr=0.001
|
13 |
) -> None:
|
14 |
self.num_epoch = num_epoch
|
15 |
self.block_sized = 1024
|
|
|
9 |
n_embd=768,
|
10 |
dropout=0.0,
|
11 |
bias=True,
|
12 |
+
lr=0.001,
|
13 |
) -> None:
|
14 |
self.num_epoch = num_epoch
|
15 |
self.block_sized = 1024
|
yume/dataset.py
CHANGED
@@ -14,19 +14,17 @@ class Trainset(Dataset):
|
|
14 |
|
15 |
def __len__(self):
|
16 |
return len(self.data)
|
17 |
-
|
18 |
def __getitem__(self, index):
|
19 |
assert len(self.data) > 10
|
20 |
return []
|
21 |
|
22 |
-
|
23 |
-
def _load_dataset(self,url="zaibutcooler/animanga-vault"):
|
24 |
loaded_dataset = load_dataset(url)
|
25 |
-
self.texts =
|
26 |
-
self.data = self.loaded_data["train"]["data"]
|
27 |
dummy_logger("Successfully loaded the dataset")
|
28 |
-
|
29 |
-
def _tokenize(self,tiktoken=True):
|
30 |
if tiktoken:
|
31 |
enc = tiktoken.get_encoding("cl100k_base")
|
32 |
assert enc.decode(enc.encode("hello world")) == "hello world"
|
@@ -36,4 +34,4 @@ class Trainset(Dataset):
|
|
36 |
else:
|
37 |
self.tokenizer = Tokenizer()
|
38 |
self.tokenizer.load_pretrained()
|
39 |
-
|
|
|
14 |
|
15 |
def __len__(self):
|
16 |
return len(self.data)
|
17 |
+
|
18 |
def __getitem__(self, index):
|
19 |
assert len(self.data) > 10
|
20 |
return []
|
21 |
|
22 |
+
def _load_dataset(self, url="zaibutcooler/animanga-vault"):
|
|
|
23 |
loaded_dataset = load_dataset(url)
|
24 |
+
self.texts = loaded_dataset["animanga"]["texts"]
|
|
|
25 |
dummy_logger("Successfully loaded the dataset")
|
26 |
+
|
27 |
+
def _tokenize(self, tiktoken=True):
|
28 |
if tiktoken:
|
29 |
enc = tiktoken.get_encoding("cl100k_base")
|
30 |
assert enc.decode(enc.encode("hello world")) == "hello world"
|
|
|
34 |
else:
|
35 |
self.tokenizer = Tokenizer()
|
36 |
self.tokenizer.load_pretrained()
|
37 |
+
self.tokenizer.encode(self.texts)
|
yume/yume.py
CHANGED
@@ -18,17 +18,16 @@ class Yume:
|
|
18 |
|
19 |
def generate(self):
|
20 |
pass
|
21 |
-
|
22 |
def sample(self):
|
23 |
pass
|
24 |
|
25 |
-
def pretrain(self,tokens):
|
26 |
lr = self.config.lr
|
27 |
num_epochs = self.config.num_epoch
|
28 |
-
|
29 |
-
|
30 |
pass
|
31 |
-
|
32 |
def fine_tune(self):
|
33 |
pass
|
34 |
|
@@ -38,7 +37,7 @@ class Yume:
|
|
38 |
n_params -= self.transformer.wpe.weight.numel()
|
39 |
dummy_logger(f"parameter count -> {n_params}")
|
40 |
return n_params
|
41 |
-
|
42 |
def save_pretrained(self, name="yume"):
|
43 |
self.model.save_pretrained(name)
|
44 |
self.model.push_to_hub(name)
|
@@ -51,4 +50,4 @@ class Yume:
|
|
51 |
def huggingface_login(self, token):
|
52 |
assert token is not None
|
53 |
login(token=token)
|
54 |
-
dummy_logger("Logged in successfully")
|
|
|
18 |
|
19 |
def generate(self):
|
20 |
pass
|
21 |
+
|
22 |
def sample(self):
|
23 |
pass
|
24 |
|
25 |
+
def pretrain(self, tokens):
|
26 |
lr = self.config.lr
|
27 |
num_epochs = self.config.num_epoch
|
28 |
+
|
|
|
29 |
pass
|
30 |
+
|
31 |
def fine_tune(self):
|
32 |
pass
|
33 |
|
|
|
37 |
n_params -= self.transformer.wpe.weight.numel()
|
38 |
dummy_logger(f"parameter count -> {n_params}")
|
39 |
return n_params
|
40 |
+
|
41 |
def save_pretrained(self, name="yume"):
|
42 |
self.model.save_pretrained(name)
|
43 |
self.model.push_to_hub(name)
|
|
|
50 |
def huggingface_login(self, token):
|
51 |
assert token is not None
|
52 |
login(token=token)
|
53 |
+
dummy_logger("Logged in successfully")
|